From b076f6640e3c2781410588f4a8e4ccfeed8eb606 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Tue, 3 Sep 2024 16:45:28 -0700 Subject: [PATCH 001/425] [lldb] Remove limit on max memory read size (#105765) `memory read` will return an error if you try to read more than 1k bytes in a single command, instructing you to set `target.max-memory-read-size` or use `--force` if you intended to read more than that. This is a safeguard for a command where people are being explicit about how much memory they would like lldb to read (either to display, or save to a file) and is an annoyance every time you need to read more than a small amount. If someone confuses the --count argument with the start address, lldb may begin dumping gigabytes of data but I'd rather that behavior than requiring everyone to special-case their way around a common use case. I don't want to remove the setting because many people have added (much larger) default max read sizes to their ~/.lldbinit files after hitting this behavior. Another option would be to stop reading/using the value in Target.cpp, but I see no harm in leaving the setting if someone really does prefer to have a small cap on their memory read size. --- lldb/source/Target/TargetProperties.td | 2 +- .../memory/big-read/TestMemoryReadMaximumSize.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 7bb5bd53688b1..0f68deb543f90 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -102,7 +102,7 @@ let Definition = "target" in { DefaultUnsignedValue<1024>, Desc<"Maximum number of characters to show when using %s in summary strings.">; def MaxMemReadSize: Property<"max-memory-read-size", "UInt64">, - DefaultUnsignedValue<1024>, + DefaultUnsignedValue<0xffffffff>, Desc<"Maximum number of bytes that 'memory read' will fetch before --force must be specified.">; def BreakpointUseAvoidList: Property<"breakpoints-use-platform-avoid-list", "Boolean">, DefaultTrue, diff --git a/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py index 259fde71a6362..1bc227dfce9be 100644 --- a/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py +++ b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py @@ -22,6 +22,8 @@ def test_memory_read_max_setting(self): ) self.assertTrue(self.bp.IsValid()) + self.runCmd("settings set target.max-memory-read-size 1024") + self.expect( "mem rea -f x -s 4 -c 2048 `&c`", error=True, From 3e8840ba71bfcceeb598c2ca28d2d8784e24ba1e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 3 Sep 2024 16:49:42 -0700 Subject: [PATCH 002/425] Remove "Target" from createXReduction naming [nfc] Despite the stale comments, none of these actually use TTI, and they're solely generating standard LLVM IR. 
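A minimal usage sketch (illustration only, not part of this change), assuming only the
declarations this patch leaves in llvm/Transforms/Utils/LoopUtils.h; it shows how a
transform would call the renamed helper, and that the reduction is emitted as plain
LLVM IR with no TTI query involved:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  // Emit an integer add reduction of a vector value. This helper was formerly
  // spelled createSimpleTargetReduction(); the behavior is unchanged, only the
  // misleading "Target" prefix is gone.
  static Value *emitAddReduction(IRBuilderBase &B, Value *VecSrc) {
    return createSimpleReduction(B, VecSrc, RecurKind::Add);
  }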
--- llvm/include/llvm/IR/VectorBuilder.h | 6 ++-- .../include/llvm/Transforms/Utils/LoopUtils.h | 30 ++++++++----------- llvm/lib/IR/VectorBuilder.cpp | 8 ++--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 28 ++++++++--------- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++-- 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index dbb9f4c7336d5..b0277c2b52595 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -103,9 +103,9 @@ class VectorBuilder { /// \param ValTy The type of operand which the reduction operation is /// performed. /// \param VecOpArray The operand list. - Value *createSimpleTargetReduction(Intrinsic::ID RdxID, Type *ValTy, - ArrayRef VecOpArray, - const Twine &Name = Twine()); + Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 56880bd4822c7..ba8af4aa2b0cd 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -393,32 +393,28 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, TargetTransformInfo::ReductionShuffle RS, RecurKind MinMaxKind = RecurKind::None); -/// Create a target reduction of the given vector. The reduction operation +/// Create a reduction of the given vector. The reduction operation /// is described by the \p Opcode parameter. min/max reductions require /// additional information supplied in \p RdxKind. -/// The target is queried to determine if intrinsics or shuffle sequences are -/// required to implement the reduction. /// Fast-math-flags are propagated using the IRBuilder's setting. -Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src, - RecurKind RdxKind); -/// Overloaded function to generate vector-predication intrinsics for target +Value *createSimpleReduction(IRBuilderBase &B, Value *Src, + RecurKind RdxKind); +/// Overloaded function to generate vector-predication intrinsics for /// reduction. -Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src, - const RecurrenceDescriptor &Desc); +Value *createSimpleReduction(VectorBuilder &VB, Value *Src, + const RecurrenceDescriptor &Desc); -/// Create a target reduction of the given vector \p Src for a reduction of the +/// Create a reduction of the given vector \p Src for a reduction of the /// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is /// described by \p Desc. -Value *createAnyOfTargetReduction(IRBuilderBase &B, Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi); +Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi); -/// Create a generic target reduction using a recurrence descriptor \p Desc -/// The target is queried to determine if intrinsics or shuffle sequences are -/// required to implement the reduction. +/// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. 
-Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, PHINode *OrigPhi = nullptr); +Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, + Value *Src, PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index b8f56a7a2e5f9..f42948ba89042 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -60,10 +60,10 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name); } -Value *VectorBuilder::createSimpleTargetReduction(Intrinsic::ID RdxID, - Type *ValTy, - ArrayRef InstOpArray, - const Twine &Name) { +Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID, + Type *ValTy, + ArrayRef InstOpArray, + const Twine &Name) { auto VPID = VPIntrinsic::getForIntrinsic(RdxID); assert(VPReductionIntrinsic::isVPReduction(VPID) && "No VPIntrinsic for this reduction"); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 61f7b23693c7e..559129442a041 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1172,9 +1172,9 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } -Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, - const RecurrenceDescriptor &Desc, - PHINode *OrigPhi) { +Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, + const RecurrenceDescriptor &Desc, + PHINode *OrigPhi) { assert( RecurrenceDescriptor::isAnyOfRecurrenceKind(Desc.getRecurrenceKind()) && "Unexpected reduction kind"); @@ -1207,8 +1207,8 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } -Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, - RecurKind RdxKind) { +Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, + RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); auto getIdentity = [&]() { Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); @@ -1241,8 +1241,8 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, } } -Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src, - const RecurrenceDescriptor &Desc) { +Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, + const RecurrenceDescriptor &Desc) { RecurKind Kind = Desc.getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && "AnyOf reduction is not supported."); @@ -1252,12 +1252,12 @@ Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src, Value *Iden = Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); Value *Ops[] = {Iden, Src}; - return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createTargetReduction(IRBuilderBase &B, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. 
// All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. @@ -1266,9 +1266,9 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - return createAnyOfTargetReduction(B, Src, Desc, OrigPhi); + return createAnyOfReduction(B, Src, Desc, OrigPhi); - return createSimpleTargetReduction(B, Src, RK); + return createSimpleReduction(B, Src, RK); } Value *llvm::createOrderedReduction(IRBuilderBase &B, @@ -1295,7 +1295,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd); auto *SrcTy = cast(Src->getType()); Value *Ops[] = {Start, Src}; - return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cf802034cd56a..a7272deb3c34f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18351,7 +18351,7 @@ class HorizontalReduction { "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); + return createSimpleReduction(Builder, VectorizedValue, RdxKind); } /// Emits optimized code for unique scalar value reused \p Cnt times. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0d3d0febfea1b..69c76edd0f554 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -593,7 +593,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. 
if (PhiTy != RdxDesc.getRecurrenceType()) @@ -1857,7 +1857,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); - NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); + NewRed = createReduction(State.Builder, RdxDesc, NewVecOp); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), NewRed, PrevInChain); @@ -1900,7 +1900,7 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) { if (isOrdered()) { NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); } else { - NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc); + NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); else From eaa95a1c2bd38332c1a4e634595f29d22b28ffea Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 4 Sep 2024 04:10:46 +0400 Subject: [PATCH 003/425] [clang] Add test for CWG2486 (`noexcept` and function pointer conversion) (#107131) [CWG2486](https://cplusplus.github.io/CWG/issues/2486.html) "Call to `noexcept` function via `noexcept(false)` pointer/lvalue" allows `noexcept` functions to be called via `noexcept(false)` pointers or values. There appears to be no implementation divergence whatsoever: https://godbolt.org/z/3afTfeEM8. That said, in C++14 and earlier we do not issue all the diagnostics we issue in C++17 and newer, so I'm specifying the status of the issue accordingly. --- clang/test/CXX/drs/cwg24xx.cpp | 43 ++++++++++++++++++++++++++++------ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp index 00b6bb5a865df..79e9d031ef41c 100644 --- a/clang/test/CXX/drs/cwg24xx.cpp +++ b/clang/test/CXX/drs/cwg24xx.cpp @@ -1,15 +1,11 @@ -// RUN: %clang_cc1 -std=c++98 -pedantic-errors %s -verify=expected -// RUN: %clang_cc1 -std=c++11 -pedantic-errors %s -verify=expected -// RUN: %clang_cc1 -std=c++14 -pedantic-errors %s -verify=expected +// RUN: %clang_cc1 -std=c++98 -pedantic-errors %s -verify=expected,cxx98-14 +// RUN: %clang_cc1 -std=c++11 -pedantic-errors %s -verify=expected,cxx98-14 +// RUN: %clang_cc1 -std=c++14 -pedantic-errors %s -verify=expected,cxx98-14 // RUN: %clang_cc1 -std=c++17 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++20 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++23 -pedantic-errors %s -verify=expected,since-cxx17 // RUN: %clang_cc1 -std=c++2c -pedantic-errors %s -verify=expected,since-cxx17 -#if __cplusplus <= 201402L -// expected-no-diagnostics -#endif - namespace cwg2406 { // cwg2406: 5 #if __cplusplus >= 201703L void fallthrough(int n) { @@ -186,3 +182,36 @@ namespace cwg2445 { // cwg2445: 19 } #endif } + +namespace cwg2486 { // cwg2486: 4 c++17 +struct C { + void fn() throw(); +}; + +static void call(C& c, void (C::*f)()) { + (c.*f)(); +} + +static void callNE(C& c, void (C::*f)() throw()) { +// cxx98-14-warning@-1 {{mangled name of 'callNE' will change in C++17 due to non-throwing exception specification in function signature}} + (c.*f)(); +} + +void ref() { + C c; + call(c, &C::fn); // <= implicit cast removes noexcept + callNE(c, &C::fn); +} + +void (*p)(); +void (*pp)() throw() = p; +// since-cxx17-error@-1 {{cannot initialize a variable of type 'void (*)() throw()' with an lvalue of 
type 'void (*)()': different exception specifications}} + +struct S { + typedef void (*p)(); + operator p(); // #cwg2486-conv +}; +void (*q)() throw() = S(); +// since-cxx17-error@-1 {{no viable conversion from 'S' to 'void (*)() throw()'}} +// since-cxx17-note@#cwg2486-conv {{candidate function}} +} // namespace cwg2486 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 4f4d8d0a97d43..ca25776823cfa 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -14751,7 +14751,7 @@

C++ defect report implementation status

2486 CD6 Call to noexcept function via noexcept(false) pointer/lvalue - Unknown + Clang 4 (C++17 onwards) 2487 From 83ad644afaac23577e3563d3ec1fac1b1fde37f4 Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Wed, 4 Sep 2024 08:13:24 +0800 Subject: [PATCH 004/425] [X86][AVX10.2] Support AVX10.2-BF16 new instructions. (#101603) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 --- clang/include/clang/Basic/BuiltinsX86.def | 62 + clang/lib/Basic/Targets/X86.cpp | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 23 + clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/avx10_2_512bf16intrin.h | 565 +++ clang/lib/Headers/avx10_2bf16intrin.h | 1091 ++++++ clang/lib/Headers/immintrin.h | 2 + clang/lib/Sema/SemaX86.cpp | 9 + .../CodeGen/X86/avx10_2_512bf16-builtins.c | 1085 ++++++ clang/test/CodeGen/X86/avx10_2bf16-builtins.c | 2082 ++++++++++++ llvm/include/llvm/IR/IntrinsicsX86.td | 253 ++ .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 8 +- .../X86/MCTargetDesc/X86ATTInstPrinter.cpp | 12 +- .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 11 + .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 9 + llvm/lib/Target/X86/X86ISelLowering.cpp | 37 +- llvm/lib/Target/X86/X86InstrAVX10.td | 315 ++ llvm/lib/Target/X86/X86InstrAVX512.td | 4 +- llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 35 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 5 + llvm/lib/Target/X86/X86InstrUtils.td | 6 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 54 + llvm/test/CodeGen/X86/avx10.2-fma-commute.ll | 1244 +++++++ .../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 587 ++++ .../CodeGen/X86/avx10_2_512bf16-intrinsics.ll | 230 ++ llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 1168 +++++++ .../CodeGen/X86/avx10_2bf16-intrinsics.ll | 602 ++++ .../MC/Disassembler/X86/avx10.2-bf16-32.txt | 3015 +++++++++++++++++ .../MC/Disassembler/X86/avx10.2-bf16-64.txt | 3015 +++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-32-att.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-32-intel.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-64-att.s | 3014 ++++++++++++++++ llvm/test/MC/X86/avx10.2-bf16-64-intel.s | 3014 ++++++++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 494 +++ 34 files changed, 28058 insertions(+), 24 deletions(-) create mode 100644 clang/lib/Headers/avx10_2_512bf16intrin.h create mode 100644 clang/lib/Headers/avx10_2bf16intrin.h create mode 100644 clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c create mode 100644 clang/test/CodeGen/X86/avx10_2bf16-builtins.c create mode 100644 llvm/test/CodeGen/X86/avx10.2-fma-commute.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt create mode 100644 llvm/test/MC/X86/avx10.2-bf16-32-att.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-32-intel.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-64-att.s create mode 100644 llvm/test/MC/X86/avx10.2-bf16-64-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index e4aa8661b9a80..48376ee052798 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:" 
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") + +// AVX10.2 BF16 +TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") 
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256") + #undef BUILTIN #undef TARGET_BUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index a9cbdb7b10dff..62c382b67ad14 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAVX10_1_512 = true; } else if (Feature == "+avx10.2-256") { HasAVX10_2 = true; + HasFullBFloat16 = true; } else if (Feature == "+avx10.2-512") { HasAVX10_2_512 = true; } else if (Feature == "+avx512cd") { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e4d169d2ad603..786c2c224b349 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_storeups512_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_storesbf16128_mask: case X86::BI__builtin_ia32_storesh128_mask: case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: @@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vfmaddph512_mask: case X86::BI__builtin_ia32_vfmaddph512_maskz: case X86::BI__builtin_ia32_vfmaddph512_mask3: 
+ case X86::BI__builtin_ia32_vfmaddnepbh128: + case X86::BI__builtin_ia32_vfmaddnepbh256: + case X86::BI__builtin_ia32_vfmaddnepbh512: case X86::BI__builtin_ia32_vfmaddps512_mask: case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: @@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_loaddqudi512_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_loadsbf16128_mask: case X86::BI__builtin_ia32_loadsh128_mask: case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: @@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_sqrtph256: case X86::BI__builtin_ia32_sqrtph: case X86::BI__builtin_ia32_sqrtph512: + case X86::BI__builtin_ia32_vsqrtnepbf16256: + case X86::BI__builtin_ia32_vsqrtnepbf16: + case X86::BI__builtin_ia32_vsqrtnepbf16512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: { if (Ops.size() == 2) { @@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps128_mask: case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: case X86::BI__builtin_ia32_fpclassph128_mask: case X86::BI__builtin_ia32_fpclassph256_mask: case X86::BI__builtin_ia32_fpclassph512_mask: @@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID; switch (BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_128; + break; + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_256; + break; + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_512; + break; case X86::BI__builtin_ia32_fpclassph128_mask: ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; break; @@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vcmppd256_round_mask: case X86::BI__builtin_ia32_vcmpps256_round_mask: case X86::BI__builtin_ia32_vcmpph256_round_mask: + case X86::BI__builtin_ia32_vcmppbf16512_mask: + case X86::BI__builtin_ia32_vcmppbf16256_mask: + case X86::BI__builtin_ia32_vcmppbf16128_mask: IsMaskFCmp = true; [[fallthrough]]; case X86::BI__builtin_ia32_cmpps: diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 5a62538792f30..e928b5b142827 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -147,10 +147,12 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h avx10_2_512niintrin.h avx10_2_512satcvtintrin.h + avx10_2bf16intrin.h avx10_2convertintrin.h avx10_2minmaxintrin.h avx10_2niintrin.h diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h new file mode 100644 index 0000000000000..392b7ae770c5b --- /dev/null +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -0,0 +1,565 @@ +/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512BF16INTRIN_H +#define __AVX10_2_512BF16INTRIN_H + +/* Define the default attributes for the functions in this file. */ +typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { + return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) { + return (__m512bh)__builtin_ia32_undef512(); +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) { + return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17, + __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22, + __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27, + __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25, + bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17, + bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9, + bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1}; +} + +#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \ + bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \ + bf29, bf30, bf31, bf32) \ + _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \ + (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \ + (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \ + (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \ + (bf3), (bf2), (bf1)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_ps(__m512bh __a) { + return (__m512)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_pd(__m512bh __a) { + return (__m512d)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_si512(__m512bh __a) { + return (__m512i)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpd_pbh(__m512d __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castsi512_pbh(__m512i __a) { + return (__m512bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh128(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh256(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m512bh 
__DEFAULT_FN_ATTRS512 +_mm512_castpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector( + __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) { + return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), + (__m512i)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_load_pbh(void const *__p) { + return *(const __m512bh *)__p; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P, + __m512bh __A) { + *(__m512bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P, + __m512bh __A) { + struct __storeu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W, + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_addne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A + (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_subne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A - (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return 
(__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mulne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A * (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_divne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A / (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_divne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_divne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vmaxpbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vminpbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +#define _mm512_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32) - 1)) + +#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32)(__U))) + +#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \ + 
((__mmask32)__builtin_ia32_vfpclasspbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U))) + +#define _mm512_fpclass_pbh_mask(__A, imm) \ + ((__mmask32)__builtin_ia32_vfpclasspbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32) - 1)) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_scalef_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(), + (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh( + __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefpbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(), + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask((__v32bf)__A, (__v32bf)__W, + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_getexp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexppbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_rsqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +#define _mm512_reducene_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_reducene_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducenepbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), 
(__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_roundscalene_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_getmant_pbh(__A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_undefined_pbh(), (__mmask32) - 1)) + +#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)(__m512bh)(__W), (__mmask32)(__U))) + +#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U))) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vsqrtnepbf16512((__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, + (__v32bf)_mm512_sqrt_pbh(__A), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmaddne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmaddne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmaddne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmsubne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static 
__inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsubne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsubne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmaddne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmaddne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmaddne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsubne_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsubne_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsubne_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif +#endif diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h new file mode 100644 index 0000000000000..0a427b9b7418b --- /dev/null +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -0,0 +1,1091 @@ +/*===-------------- avx10_2bf16intrin.h - AVX10-BF16 intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." 
+#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2BF16INTRIN_H +#define __AVX10_2BF16INTRIN_H + +typedef __bf16 __m128bh_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { + return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_setzero_pbh(void) { + return __builtin_bit_cast(__m128bh, _mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castpbf16_ps(__m128bh __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_ps(__m256bh __a) { + return (__m256)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_pd(__m256bh __a) { + return (__m256d)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castpbf16_pd(__m128bh __a) { + return (__m128d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_castpbf16_si128(__m128bh __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castpbf16_si256(__m256bh __a) { + return (__m256i)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castps_pbh(__m128 __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_castps_pbh(__m256 __a) { + return (__m256bh)__a; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtsbh_bf16(__m128bh __a) { + return __a[0]; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS256 +_mm256_cvtsbh_bf16(__m256bh __a) { + return __a[0]; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castpd_pbh(__m128d __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castpd_pbh(__m256d __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_castsi128_pbh(__m128i __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castsi256_pbh(__m256i __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_castpbf16256_pbh128(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castpbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_zextpbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_undefined_pbh(void) { + return (__m256bh)__builtin_ia32_undef256(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_load_sbh(void const *__dp) { + __m128bh src = (__v8bf)_mm_setzero_pbh(); + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__dp, src, + 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_load_sbh(__m128bh __W, __mmask8 __U, const void *__A) { + __m128bh src = (__v8bf)__builtin_shufflevector( + (__v8bf)__W, 
(__v8bf)_mm_setzero_pbh(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__A, src, + __U & 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sbh(__mmask8 __U, const void *__A) { + return (__m128bh)__builtin_ia32_loadsbf16128_mask( + (const __v8bf *)__A, (__v8bf)_mm_setzero_pbh(), __U & 1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_load_pbh(void const *__p) { + return *(const __m256bh *)__p; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_load_pbh(void const *__p) { + return *(const __m128bh *)__p; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sbh(void *__dp, + __m128bh __a) { + struct __mm_store_sbh_struct { + __bf16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sbh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sbh(void *__W, + __mmask8 __U, + __m128bh __A) { + __builtin_ia32_storesbf16128_mask((__v8bf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_pbh(void *__P, + __m256bh __A) { + *(__m256bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_pbh(void *__P, + __m128bh __A) { + *(__m128bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_pbh(void *__P, + __m256bh __A) { + struct __storeu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_pbh(void *__P, + __m128bh __A) { + struct __storeu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, + __m128bh __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), + _mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_undefined_pbh(void) { + return (__m128bh)__builtin_ia32_undef128(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_set_sbh(__bf16 bf) { + return (__v8bf)__builtin_shufflevector( + (__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}, (__v8bf)_mm_setzero_pbh(), 0, 8, + 8, 8, 8, 8, 8, 8); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_set1_pbh(__bf16 bf) { + return (__m128bh)(__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set1_pbh(__bf16 bf) { + return (__m256bh)(__v16bf){bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 +_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, + __bf16 bf6, __bf16 
bf7, __bf16 bf8) { + return (__m128bh)(__v8bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + return (__m256bh)(__v16bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16}; +} + +#define _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \ + _mm_set_pbh((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1)) + +#define _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16) \ + _mm256_set_pbh((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \ + (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \ + (bf1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_abs_pbh(__m256bh __A) { + return (__m256bh)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), + (__m256i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_abs_pbh(__m128bh __A) { + return (__m128bh)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, (__v8bf)__W, + (__v8bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, (__v16bf)__W, + (__v16bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { + return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { + return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutexvar_pbh(__m128i __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_addne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A + (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_addne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_addne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_addne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_addne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_addne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A + (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_addne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_addne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 
+_mm_maskz_addne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_addne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_subne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A - (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_subne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_subne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_subne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_subne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_subne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A - (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_subne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_subne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_subne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_subne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mulne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A * (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_mulne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mulne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_mulne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mulne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mulne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A * (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_mulne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_mulne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_mulne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_mulne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_divne_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)((__v16bf)__A / (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_divne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_divne_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_divne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_divne_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 
_mm_divne_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A / (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_divne_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_divne_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_divne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_divne_pbh(__A, __B), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_max_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vmaxpbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_max_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vmaxpbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_max_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_max_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_min_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vminpbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_min_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vminpbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_min_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_min_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comeqsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16eq((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comltsbh(__m128bh A, + __m128bh B) { + return 
__builtin_ia32_vcomsbf16lt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comlesbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16le((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comgtsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16gt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comgesbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16ge((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comneqsbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomsbf16neq((__v8bf)A, (__v8bf)B); +} + +#define _mm256_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmppbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16) - 1)) + +#define _mm256_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmppbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16)(__U))) + +#define _mm_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmppbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8) - 1)) + +#define _mm_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmppbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8)(__U))) + +#define _mm256_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask16)__builtin_ia32_vfpclasspbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16)(__U))) + +#define _mm256_fpclass_pbh_mask(__A, imm) \ + ((__mmask16)__builtin_ia32_vfpclasspbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16) - 1)) + +#define _mm_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask8)__builtin_ia32_vfpclasspbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__mmask8)(__U))) + +#define _mm_fpclass_pbh_mask(__A, imm) \ + ((__mmask8)__builtin_ia32_vfpclasspbf16128_mask((__v8bf)(__m128bh)(__A), \ + (int)(imm), (__mmask8) - 1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_scalef_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_undefined_pbh(), + (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pbh( + __m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefpbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_setzero_pbh(), + (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_scalef_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)__W, (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefpbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ 
__m256bh __DEFAULT_FN_ATTRS256 _mm256_rcp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask((__v16bf)__A, (__v16bf)__W, + (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rcp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_getexp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexppbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_getexp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexppbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_rsqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rsqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return 
(__m128bh)__builtin_ia32_vrsqrtpbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +#define _mm256_reducene_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_undefined_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_reducene_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducenepbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_reducene_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_undefined_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_reducene_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_reducene_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducenepbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_roundscalene_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalenepbf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_roundscalene_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_roundscalene_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_roundscalene_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalenepbf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_getmant_pbh(__A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)_mm256_undefined_pbh(), (__mmask16) - 1)) + +#define _mm256_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)(__m256bh)(__W), (__mmask16)(__U))) + +#define _mm256_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantpbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)_mm256_setzero_pbh(), (__mmask16)(__U))) + +#define _mm_getmant_pbh(__A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_undefined_pbh(), (__mmask8) - 1)) + +#define 
_mm_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)(__m128bh)(__W), (__mmask8)(__U))) + +#define _mm_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantpbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U))) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vsqrtnepbf16256((__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_sqrt_pbh(__A), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, + (__v16bf)_mm256_sqrt_pbh(__A), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vsqrtnepbf16((__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmaddne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fmsubne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsubne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ 
__m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsubne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmaddne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmaddne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmaddne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmaddne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsubne_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsubne_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsubne_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsubne_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmaddne_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + 
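+/* The 128-bit FMA family below mirrors the 256-bit one above: fmsub, fnmadd
+   and fnmsub are all expressed through __builtin_ia32_vfmaddnepbh128 by
+   negating __C, __B, or both, and the mask_/mask3_/maskz_ variants blend
+   inactive lanes from __A, from __C, or with zero respectively.
+   Illustrative use (variable names are placeholders only):
+     __m128bh r = _mm_mask_fmaddne_pbh(a, 0x0F, b, c);
+   lanes 0-3 of r hold a*b+c, lanes 4-7 keep a. */
+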
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsubne_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmaddne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsubne_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif +#endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index a922056622e79..30fcc028958f3 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -649,6 +649,7 @@ _storebe_i64(void * __P, long long __D) { #endif #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__) 
+#include <avx10_2bf16intrin.h>
 #include
 #include
 #include
@@ -656,6 +657,7 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 
 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512bf16intrin.h>
 #include
 #include
 #include
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 311e574537059..233a068c8574c 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -875,6 +875,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_rndscaleps_mask:
   case X86::BI__builtin_ia32_rndscalepd_mask:
   case X86::BI__builtin_ia32_rndscaleph_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_128_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_256_mask:
+  case X86::BI__builtin_ia32_vrndscalenepbf16_mask:
   case X86::BI__builtin_ia32_reducepd128_mask:
   case X86::BI__builtin_ia32_reducepd256_mask:
   case X86::BI__builtin_ia32_reducepd512_mask:
@@ -884,6 +887,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduceph128_mask:
   case X86::BI__builtin_ia32_reduceph256_mask:
   case X86::BI__builtin_ia32_reduceph512_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16128_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16256_mask:
+  case X86::BI__builtin_ia32_vreducenepbf16512_mask:
   case X86::BI__builtin_ia32_vreducepd256_round_mask:
   case X86::BI__builtin_ia32_vreduceps256_round_mask:
   case X86::BI__builtin_ia32_vreduceph256_round_mask:
@@ -911,6 +917,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_fpclassph128_mask:
   case X86::BI__builtin_ia32_fpclassph256_mask:
   case X86::BI__builtin_ia32_fpclassph512_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
   case X86::BI__builtin_ia32_fpclasssd_mask:
   case X86::BI__builtin_ia32_fpclassss_mask:
   case X86::BI__builtin_ia32_fpclasssh_mask:
diff --git a/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c
new file mode 100644
index 0000000000000..b00859c174fba
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c
@@ -0,0 +1,1085 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2-512 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m512bh test_mm512_setzero_pbh() {
+  // CHECK-LABEL: @test_mm512_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm512_setzero_pbh();
+}
+
+__m512bh test_mm512_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm512_undefined_pbh
+  // CHECK: ret <32 x bfloat> zeroinitializer
+  return _mm512_undefined_pbh();
+}
+
+__m512bh test_mm512_set1_pbh(__bf16 h) {
+  // CHECK-LABEL: @test_mm512_set1_pbh
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7
+  // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8
+  // CHECK:
insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_set1_pbh(h); +} + +__m512bh test_mm512_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, + __bf16 bf17, __bf16 bf18, __bf16 bf19, __bf16 bf20, + __bf16 bf21, __bf16 bf22, __bf16 bf23, __bf16 bf24, + __bf16 bf25, __bf16 bf26, __bf16 bf27, __bf16 bf28, + __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + // CHECK-LABEL: @test_mm512_set_pbh + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_set_pbh(bf1, 
bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16, + bf17, bf18, bf19, bf20, bf21, bf22, bf23, bf24, + bf25, bf26, bf27, bf28, bf29, bf30, bf31, bf32); +} + +__m512bh test_mm512_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, + __bf16 bf17, __bf16 bf18, __bf16 bf19, __bf16 bf20, + __bf16 bf21, __bf16 bf22, __bf16 bf23, __bf16 bf24, + __bf16 bf25, __bf16 bf26, __bf16 bf27, __bf16 bf28, + __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + // CHECK-LABEL: @test_mm512_setr_pbh + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 15 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 16 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 17 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 18 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 19 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 20 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 21 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 22 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 23 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 24 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 25 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 26 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 27 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 28 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 29 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 30 + // CHECK: insertelement <32 x bfloat> {{.*}}, i32 31 + return _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16, + bf17, bf18, bf19, bf20, bf21, bf22, bf23, bf24, + bf25, bf26, bf27, bf28, bf29, bf30, bf31, bf32); +} + +__m512 test_mm512_castpbf16_ps(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_ps + // CHECK: bitcast <32 x bfloat> %{{.*}} to <16 x float> + return _mm512_castpbf16_ps(A); +} + +__m512d test_mm512_castpbf16_pd(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_pd + // CHECK: bitcast <32 x bfloat> %{{.*}} to <8 x double> + return _mm512_castpbf16_pd(A); +} + +__m512i test_mm512_castpbf16_si512(__m512bh A) { + // CHECK-LABEL: test_mm512_castpbf16_si512 + // CHECK: bitcast <32 x bfloat> %{{.*}} to <8 x i64> + return _mm512_castpbf16_si512(A); +} + +__m512bh test_mm512_castps_pbh(__m512 A) { + // CHECK-LABEL: test_mm512_castps_pbh + // CHECK: bitcast <16 x float> %{{.*}} to <32 x bfloat> + return _mm512_castps_pbh(A); +} + +__m512bh test_mm512_castpd_pbh(__m512d A) { + // CHECK-LABEL: test_mm512_castpd_pbh + // CHECK: bitcast <8 x double> %{{.*}} to <32 x bfloat> + return 
_mm512_castpd_pbh(A); +} + +__m512bh test_mm512_castsi512_pbh(__m512i A) { + // CHECK-LABEL: test_mm512_castsi512_pbh + // CHECK: bitcast <8 x i64> %{{.*}} to <32 x bfloat> + return _mm512_castsi512_pbh(A); +} + +__m128bh test_mm512_castpbf16512_pbh128(__m512bh __a) { + // CHECK-LABEL: test_mm512_castpbf16512_pbh128 + // CHECK: shufflevector <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <8 x i32> + return _mm512_castpbf16512_pbh128(__a); +} + +__m256bh test_mm512_castpbf16512_pbh256(__m512bh __a) { + // CHECK-LABEL: test_mm512_castpbf16512_pbh256 + // CHECK: shufflevector <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <16 x i32> + return _mm512_castpbf16512_pbh256(__a); +} + +__m512bh test_mm512_castpbf16128_pbh512(__m128bh __a) { + // CHECK-LABEL: test_mm512_castpbf16128_pbh512 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <32 x i32> + return _mm512_castpbf16128_pbh512(__a); +} + +__m512bh test_mm512_castpbf16256_pbh512(__m256bh __a) { + // CHECK-LABEL: test_mm512_castpbf16256_pbh512 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <32 x i32> + return _mm512_castpbf16256_pbh512(__a); +} + +__m512bh test_mm512_zextpbf16128_pbh512(__m128bh __a) { + // CHECK-LABEL: test_mm512_zextpbf16128_pbh512 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> {{.*}}, <32 x i32> + return _mm512_zextpbf16128_pbh512(__a); +} + +__m512bh test_mm512_zextpbf16256_pbh512(__m256bh __a) { + // CHECK-LABEL: test_mm512_zextpbf16256_pbh512 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> {{.*}}, <32 x i32> + return _mm512_zextpbf16256_pbh512(__a); +} + +__m512bh test_mm512_abs_pbh(__m512bh a) { + // CHECK-LABEL: @test_mm512_abs_pbh + // CHECK: and <16 x i32> + return _mm512_abs_pbh(a); +} + +// VMOVSH + +__m512bh test_mm512_load_pbh(void *p) { + // CHECK-LABEL: @test_mm512_load_pbh + // CHECK: load <32 x bfloat>, ptr %{{.*}}, align 64 + return _mm512_load_pbh(p); +} + +__m512bh test_mm512_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm512_loadu_pbh + // CHECK: load <32 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm512_loadu_pbh(p); +} + +void test_mm512_store_pbh(void *p, __m512bh a) { + // CHECK-LABEL: @test_mm512_store_pbh + // CHECK: store <32 x bfloat> %{{.*}}, ptr %{{.*}}, align 64 + _mm512_store_pbh(p, a); +} + +void test_mm512_storeu_pbh(void *p, __m512bh a) { + // CHECK-LABEL: @test_mm512_storeu_pbh + // CHECK: store <32 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm512_storeu_pbh(p, a); +} + +__m512bh test_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + // CHECK-LABEL: @test_mm512_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_blend_pbh(__U, __A, __W); +} + +__m512bh test_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + // CHECK-LABEL: @test_mm512_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x bfloat> + return _mm512_permutex2var_pbh(__A, __I, __B); +} + +__m512bh test_mm512_permutexvar_epi16(__m512i __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_permutexvar_epi16 + // CHECK: %{{.*}} 
= bitcast <32 x bfloat> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x bfloat> + return _mm512_permutexvar_pbh(__A, __B); +} + +__m512bh test_mm512_addne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_addne_pbh + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_addne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_addne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fadd <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_addne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_subne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_subne_pbh + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_subne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_subne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fsub <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_subne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_mulne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_mulne_pbh + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_mulne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fmul <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_mulne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_divne_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_divne_pbh + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_divne_pbh(__A, __B); +} + +__m512bh test_mm512_mask_divne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_divne_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_divne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: %{{.*}} = fdiv <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_divne_pbh(__U, __A, __B); +} + +__m512bh test_mm512_max_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16512( + 
return _mm512_max_pbh(__A, __B); +} + +__m512bh test_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_max_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_max_pbh(__U, __A, __B); +} + +__m512bh test_mm512_min_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16512( + return _mm512_min_pbh(__A, __B); +} + +__m512bh test_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_min_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16512 + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_min_pbh(__U, __A, __B); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: @test_mm512_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_lt_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_le_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_le_os + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_unord_q(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nlt_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nle_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_ord_q(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nge_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_ngt_us(__m512bh a, __m512bh 
b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_false_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_false_oq + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ge_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_gt_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_true_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_true_uq + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_lt_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_le_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_unord_s(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_us + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nlt_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_nle_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ord_s(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_cmp_pbh_mask_eq_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_cmp_pbh_mask_nge_uq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_ngt_uq(__m512bh a, 
__m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_false_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_false_os + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_neq_os(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_neq_os + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_cmp_pbh_mask_ge_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_gt_oq(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_cmp_pbh_mask_true_us(__m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_cmp_pbh_mask_true_us + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: @test_mm512_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_lt_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_le_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_unord_q(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nlt_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nle_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ord_q(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} 
+ return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nge_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ngt_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_false_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ge_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_gt_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_true_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_lt_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_le_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_unord_s(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, 
%{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nlt_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nle_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ord_s(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_eq_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_nge_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ngt_uq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_false_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_neq_os(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_ge_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_gt_oq(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> %{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_mask_cmp_pbh_mask_true_us(__mmask32 m, __m512bh a, __m512bh b) { + // CHECK-LABEL: test_mm512_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <32 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> 
%{{.*}}, %{{.*}} + return _mm512_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_fpclass_pbh_mask(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.512 + return _mm512_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask32 test_mm512_fpclass_pbh_mask(__m512bh __A) { + // CHECK-LABEL: @test_mm512_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.512 + return _mm512_fpclass_pbh_mask(__A, 4); +} + +__m512bh test_mm512_scalef_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_scalef_pbh(__A, __B); +} + +__m512bh test_mm512_mask_scalef_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m512bh test_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.512 + return _mm512_maskz_scalef_pbh(__U, __A, __B); +} + +__m512bh test_mm512_rcp_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return _mm512_rcp_pbh(__A); +} + +__m512bh test_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return (__m512bh)_mm512_mask_rcp_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.512 + return _mm512_maskz_rcp_pbh(__U, __A); +} + +__m512bh test_mm512_getexp_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_getexp_pbh(__A); +} + +__m512bh test_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_mask_getexp_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.512 + return _mm512_maskz_getexp_pbh(__U, __A); +} + +__m512bh test_mm512_rsqrt_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return _mm512_rsqrt_pbh(__A); +} + +__m512bh test_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return (__m512bh)_mm512_mask_rsqrt_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.512 + return _mm512_maskz_rsqrt_pbh(__U, __A); +} + +__m512bh test_mm512_reducene_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_reducene_pbh(__A, 3); +} + +__m512bh test_mm512_mask_reducene_pbh(__m512bh __W, __mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m512bh test_mm512_maskz_reducene_pbh(__mmask16 __U, __m512bh __A) { 
+ // CHECK-LABEL: @test_mm512_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.512 + return _mm512_maskz_reducene_pbh(__U, __A, 1); +} + +__m512bh test_mm512_roundscalene_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_roundscalene_pbh(__A, 3); +} + +__m512bh test_mm512_mask_roundscalene_pbh(__m512bh __W, __mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m512bh test_mm512_maskz_roundscalene_pbh(__mmask16 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.512 + return _mm512_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m512bh test_mm512_getmant_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_mask_getmant_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_maskz_getmant_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.512 + return _mm512_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512bh test_mm512_sqrt_pbh(__m512bh __A) { + // CHECK-LABEL: @test_mm512_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + return _mm512_sqrt_pbh(__A); +} + +__m512bh test_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_mask_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return (__m512bh)_mm512_mask_sqrt_pbh(__W, __U, __A); +} + +__m512bh test_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { + // CHECK-LABEL: @test_mm512_maskz_sqrt_pbh + // CHECK: %{{.*}} = call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_sqrt_pbh(__U, __A); +} + +__m512bh test_mm512_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fmaddne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fmaddne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select 
<32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fmaddne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fmaddne_pbh + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_fmaddne_pbh(__U, __A, __B, __C); +} + +__m512bh test_mm512_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fmsubne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fmsubne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fmsubne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m512bh test_mm512_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + return _mm512_fnmaddne_pbh(__A, __B, __C); +} + +__m512bh test_mm512_mask_fnmaddne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m512bh test_mm512_mask3_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + return _mm512_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m512bh test_mm512_maskz_fnmaddne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}) + // CHECK: 
select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_maskz_fnmaddne_pbh(__U, __A, __B, __C);
+}
+
+__m512bh test_mm512_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  return _mm512_fnmsubne_pbh(__A, __B, __C);
+}
+
+__m512bh test_mm512_mask_fnmsubne_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_mask_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_mask_fnmsubne_pbh(__A, __U, __B, __C);
+}
+
+__m512bh test_mm512_mask3_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
+  // CHECK-LABEL: @test_mm512_mask3_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_mask3_fnmsubne_pbh(__A, __B, __C, __U);
+}
+
+__m512bh test_mm512_maskz_fnmsubne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fnmsubne_pbh
+  // CHECK: fneg
+  // CHECK: fneg
+  // CHECK: call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}})
+  // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}}
+  return _mm512_maskz_fnmsubne_pbh(__U, __A, __B, __C);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c
new file mode 100644
index 0000000000000..cd94edcf58ea2
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c
@@ -0,0 +1,2082 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2-256 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m256bh test_mm256_setzero_pbh() {
+  // CHECK-LABEL: @test_mm256_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm256_setzero_pbh();
+}
+
+__m128bh test_mm_setzero_pbh() {
+  // CHECK-LABEL: @test_mm_setzero_pbh
+  // CHECK: zeroinitializer
+  return _mm_setzero_pbh();
+}
+
+__m256bh test_mm256_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm256_undefined_pbh
+  // CHECK: ret <16 x bfloat> zeroinitializer
+  return _mm256_undefined_pbh();
+}
+
+__m128bh test_mm_undefined_pbh(void) {
+  // CHECK-LABEL: @test_mm_undefined_pbh
+  // CHECK: ret <8 x bfloat> zeroinitializer
+  return _mm_undefined_pbh();
+}
+
+__bf16 test_mm_cvtsbh_bf16(__m128bh __A) {
+  // CHECK-LABEL: @test_mm_cvtsbh_bf16
+  // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0
+  return _mm_cvtsbh_bf16(__A);
+}
+
+__bf16 test_mm256_cvtsbh_bf16(__m256bh __A) {
+  // CHECK-LABEL: @test_mm256_cvtsbh_bf16
+  // CHECK: extractelement <16 x bfloat> %{{.*}}, i32 0
+  return _mm256_cvtsbh_bf16(__A);
+}
+
+__m128bh test_mm_set_sbh(__bf16 h) {
+  // CHECK-LABEL: @test_mm_set_sbh
+  // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0
+  // CHECK: insertelement <8 x bfloat>
%{{.*}}, bfloat %{{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 7 + return _mm_set_sbh(h); +} + +__m128bh test_mm_set1_pbh(__bf16 h) { + // CHECK-LABEL: @test_mm_set1_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_set1_pbh(h); +} + +__m256bh test_mm256_set1_pbh(__bf16 h) { + // CHECK-LABEL: @test_mm256_set1_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_set1_pbh(h); +} + +__m128bh test_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8) { + // CHECK-LABEL: @test_mm_set_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_set_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8); +} + +__m256bh test_mm256_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + // CHECK-LABEL: @test_mm256_set_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x 
bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_set_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16); +} + +__m128bh test_mm_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8) { + // CHECK-LABEL: @test_mm_setr_pbh + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <8 x bfloat> {{.*}}, i32 7 + return _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8); +} + +__m256bh test_mm256_setr_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, + __bf16 bf5, __bf16 bf6, __bf16 bf7, __bf16 bf8, + __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + // CHECK-LABEL: @test_mm256_setr_pbh + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 0 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 1 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 2 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 3 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 4 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 5 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 6 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 7 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 8 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 9 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 10 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 11 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 12 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 13 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 14 + // CHECK: insertelement <16 x bfloat> {{.*}}, i32 15 + return _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16); +} + +__m128 test_mm_castpbf16_ps(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_ps + // CHECK: bitcast <8 x bfloat> %{{.*}} to <4 x float> + return _mm_castpbf16_ps(A); +} + +__m256 test_mm256_castpbf16_ps(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_ps + // CHECK: bitcast <16 x bfloat> %{{.*}} to <8 x float> + return _mm256_castpbf16_ps(A); +} + +__m128i test_mm_castpbf16_si128(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_si128 + // CHECK: bitcast <8 x bfloat> %{{.*}} to <2 x i64> + return _mm_castpbf16_si128(A); +} + +__m256i test_mm256_castpbf16_si256(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_si256 + // CHECK: bitcast <16 x bfloat> %{{.*}} to <4 x i64> + return _mm256_castpbf16_si256(A); +} + +__m128bh test_mm_castps_pbh(__m128 A) { + // CHECK-LABEL: test_mm_castps_pbh + // CHECK: bitcast <4 x float> %{{.*}} to <8 x bfloat> + return _mm_castps_pbh(A); +} + +__m256bh test_mm256_castps_pbh(__m256 A) { + // CHECK-LABEL: test_mm256_castps_pbh + // CHECK: bitcast <8 x float> %{{.*}} to <16 x bfloat> + return _mm256_castps_pbh(A); +} + +__m128bh test_mm_castpd_pbh(__m128d A) { + // CHECK-LABEL: test_mm_castpd_pbh + // CHECK: bitcast <2 x double> %{{.*}} to <8 x bfloat> + return _mm_castpd_pbh(A); +} + +__m256bh 
test_mm256_castpd_pbh(__m256d A) { + // CHECK-LABEL: test_mm256_castpd_pbh + // CHECK: bitcast <4 x double> %{{.*}} to <16 x bfloat> + return _mm256_castpd_pbh(A); +} + +__m128bh test_mm_castsi128_pbh(__m128i A) { + // CHECK-LABEL: test_mm_castsi128_pbh + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x bfloat> + return _mm_castsi128_pbh(A); +} + +__m256bh test_mm256_castsi256_pbh(__m256i A) { + // CHECK-LABEL: test_mm256_castsi256_pbh + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x bfloat> + return _mm256_castsi256_pbh(A); +} + +__m128d test_mm_castpbf16_pd(__m128bh A) { + // CHECK-LABEL: test_mm_castpbf16_pd + // CHECK: bitcast <8 x bfloat> %{{.*}} to <2 x double> + return _mm_castpbf16_pd(A); +} + +__m128bh test_mm256_castpbf16256_pbh128(__m256bh __a) { + // CHECK-LABEL: test_mm256_castpbf16256_pbh128 + // CHECK: shufflevector <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <8 x i32> + return _mm256_castpbf16256_pbh128(__a); +} + +__m256bh test_mm256_castpbf16128_pbh256(__m128bh __a) { + // CHECK-LABEL: test_mm256_castpbf16128_pbh256 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <16 x i32> + return _mm256_castpbf16128_pbh256(__a); +} + +__m256d test_mm256_castpbf16_pd(__m256bh A) { + // CHECK-LABEL: test_mm256_castpbf16_pd + // CHECK: bitcast <16 x bfloat> %{{.*}} to <4 x double> + return _mm256_castpbf16_pd(A); +} + +__m256bh test_mm256_zextpbf16128_pbh256(__m128bh __a) { + // CHECK-LABEL: test_mm256_zextpbf16128_pbh256 + // CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> {{.*}}, <16 x i32> + return _mm256_zextpbf16128_pbh256(__a); +} + +__m128bh test_mm_abs_pbh(__m128bh a) { + // CHECK-LABEL: @test_mm_abs_pbh + // CHECK: and <4 x i32> + return _mm_abs_pbh(a); +} + +__m256bh test_mm256_abs_pbh(__m256bh a) { + // CHECK-LABEL: @test_mm256_abs_pbh + // CHECK: and <8 x i32> + return _mm256_abs_pbh(a); +} + +__m256bh test_mm256_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm256_loadu_pbh + // CHECK: load <16 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm256_loadu_pbh(p); +} + +__m128bh test_mm_load_sbh(void const *A) { + // CHECK-LABEL: test_mm_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> bitcast (<1 x i8> to <8 x i1>), <8 x bfloat> %{{.*}}) + return _mm_load_sbh(A); +} + +__m256bh test_mm256_load_pbh(void *p) { + // CHECK-LABEL: @test_mm256_load_pbh + // CHECK: load <16 x bfloat>, ptr %{{.*}}, align 32 + return _mm256_load_pbh(p); +} + +__m128bh test_mm_load_pbh(void *p) { + // CHECK-LABEL: @test_mm_load_pbh + // CHECK: load <8 x bfloat>, ptr %{{.*}}, align 16 + return _mm_load_pbh(p); +} + +__m128bh test_mm_loadu_pbh(void *p) { + // CHECK-LABEL: @test_mm_loadu_pbh + // CHECK: load <8 x bfloat>, ptr {{.*}}, align 1{{$}} + return _mm_loadu_pbh(p); +} + +void test_mm_store_sbh(void *A, __m128bh B) { + // CHECK-LABEL: test_mm_store_sbh + // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: store bfloat %{{.*}}, ptr %{{.*}}, align 1{{$}} + _mm_store_sbh(A, B); +} + +void test_mm_mask_store_sbh(void *__P, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_store_sbh + // CHECK: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + _mm_mask_store_sbh(__P, __U, __A); +} + +void test_mm256_store_pbh(void *p, __m256bh a) { + // CHECK-LABEL: @test_mm256_store_pbh + // CHECK: store <16 x bfloat> %{{.*}}, ptr %{{.*}}, align 32 + _mm256_store_pbh(p, a); +} + +void test_mm_store_pbh(void *p, __m128bh a) { + // CHECK-LABEL: @test_mm_store_pbh + // 
CHECK: store <8 x bfloat> %{{.*}}, ptr %{{.*}}, align 16 + _mm_store_pbh(p, a); +} + +__m128bh test_mm_mask_load_sbh(__m128bh __A, __mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_mask_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_mask_load_sbh(__A, __U, __W); +} + +__m128bh test_mm_maskz_load_sbh(__mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_maskz_load_sbh + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_maskz_load_sbh(__U, __W); +} + +void test_mm256_storeu_pbh(void *p, __m256bh a) { + // CHECK-LABEL: @test_mm256_storeu_pbh + // CHECK: store <16 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm256_storeu_pbh(p, a); +} + +void test_mm_storeu_pbh(void *p, __m128bh a) { + // CHECK-LABEL: @test_mm_storeu_pbh + // CHECK: store <8 x bfloat> %{{.*}}, ptr %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_pbh(p, a); +} + +__m128bh test_mm_move_sbh(__m128bh A, __m128bh B) { + // CHECK-LABEL: test_mm_move_sbh + // CHECK: extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 0 + return _mm_move_sbh(A, B); +} + +__m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mask_move_sbh + // CHECK: [[EXT:%.*]] = extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x bfloat> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x bfloat> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, bfloat [[A]], bfloat [[B]] + // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 + return _mm_mask_move_sbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_maskz_move_sbh + // CHECK: [[EXT:%.*]] = extractelement <8 x bfloat> %{{.*}}, i32 0 + // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x bfloat> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x bfloat> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, bfloat [[A]], bfloat [[B]] + // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 + return _mm_maskz_move_sbh(__U, __A, __B); +} + +__m128bh test_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { + // CHECK-LABEL: @test_mm_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_blend_pbh(__U, __A, __W); +} + +__m256bh test_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { + // CHECK-LABEL: @test_mm256_mask_blend_pbh + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_blend_pbh(__U, __A, __W); +} + +__m128bh test_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { + // CHECK-LABEL: @test_mm_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <8 x 
bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <8 x bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x bfloat> + return _mm_permutex2var_pbh(__A, __I, __B); +} + +__m256bh test_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { + // CHECK-LABEL: @test_mm256_permutex2var_pbh + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x bfloat> + return _mm256_permutex2var_pbh(__A, __I, __B); +} + +__m128bh test_mm_permutexvar_pbh(__m128i __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_permutexvar_pbh + // CHECK: %{{.*}} = bitcast <8 x bfloat> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x bfloat> + return _mm_permutexvar_pbh(__A, __B); +} + +__m256bh test_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_permutexvar_pbh + // CHECK: %{{.*}} = bitcast <16 x bfloat> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x bfloat> + return _mm256_permutexvar_pbh(__A, __B); +} + +__m256bh test_mm256_addne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_addne_pbh + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_addne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_addne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_addne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_addne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fadd <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_addne_pbh(__U, __A, __B); +} + +__m128bh test_mm_addne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_addne_pbh + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_addne_pbh(__A, __B); +} + +__m128bh test_mm_mask_addne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_addne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_addne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fadd <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_addne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_subne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_subne_pbh + // CHECK: %{{.*}} = fsub <16 x 
bfloat> %{{.*}}, %{{.*}} + return _mm256_subne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_subne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fsub <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_subne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_subne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fsub <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_subne_pbh(__U, __A, __B); +} + +__m128bh test_mm_subne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_subne_pbh + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_subne_pbh(__A, __B); +} + +__m128bh test_mm_mask_subne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_subne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_subne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fsub <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_subne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_mulne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_mulne_pbh + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_mulne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_mulne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_mulne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fmul <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_mulne_pbh(__U, __A, __B); +} + +__m128bh test_mm_mulne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mulne_pbh + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_mulne_pbh(__A, __B); +} + +__m128bh test_mm_mask_mulne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_mulne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_mulne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fmul <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_mulne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_divne_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_divne_pbh + // CHECK: %{{.*}} = fdiv <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_divne_pbh(__A, __B); +} + +__m256bh test_mm256_mask_divne_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fdiv <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_divne_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_divne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: %{{.*}} = fdiv <16 x bfloat> 
%{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_divne_pbh(__U, __A, __B); +} + +__m128bh test_mm_divne_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_divne_pbh + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_divne_pbh(__A, __B); +} + +__m128bh test_mm_mask_divne_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_divne_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_divne_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: %{{.*}} = fdiv <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_divne_pbh(__U, __A, __B); +} + +__m256bh test_mm256_max_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16256( + return _mm256_max_pbh(__A, __B); +} + +__m256bh test_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_max_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_max_pbh(__U, __A, __B); +} + +__m128bh test_mm_max_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_max_pbh + // CHECK: @llvm.x86.avx10.vmaxpbf16128( + return _mm_max_pbh(__A, __B); +} + +__m128bh test_mm_mask_max_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_max_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_max_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vmaxpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_max_pbh(__U, __A, __B); +} + +__m256bh test_mm256_min_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16256( + return _mm256_min_pbh(__A, __B); +} + +__m256bh test_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_min_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16256 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_min_pbh(__U, __A, __B); +} + +__m128bh test_mm_min_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_min_pbh + // CHECK: @llvm.x86.avx10.vminpbf16128( + return _mm_min_pbh(__A, __B); +} + +__m128bh test_mm_mask_min_pbh(__m128bh __W, __mmask16 __U, __m128bh __A, __m128bh __B) { + // CHECK: @llvm.x86.avx10.vminpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_min_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_min_pbh(__mmask16 __U, __m128bh __A, __m128bh __B) { + // 
CHECK: @llvm.x86.avx10.vminpbf16128 + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_min_pbh(__U, __A, __B); +} + +int test_mm_comeqsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comeqsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comeqsbh(__A, __B); +} + +int test_mm_comltsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comltsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comltsbh(__A, __B); +} + +int test_mm_comlesbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comlesbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comlesbh(__A, __B); +} + +int test_mm_comgtsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comgtsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16gt(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comgtsbh(__A, __B); +} + +int test_mm_comgesbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comgesbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comgesbh(__A, __B); +} + +int test_mm_comneqsbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: test_mm_comneqsbh + // CHECK: %{{.}} = call i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat> %{{.}}, <8 x bfloat> %{{.}}) + return _mm_comneqsbh(__A, __B); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: @test_mm256_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_lt_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_le_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_le_os + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_unord_q(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_nlt_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_nle_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_ord_q(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask16 
test_mm256_cmp_pbh_mask_nge_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_ngt_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_false_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_false_oq + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ge_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_gt_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_true_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_true_uq + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_lt_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_le_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_unord_s(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_us + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_cmp_pbh_mask_nlt_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_nle_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ord_s(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_cmp_pbh_mask_eq_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + 
+__mmask16 test_mm256_cmp_pbh_mask_nge_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_ngt_uq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_false_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_false_os + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_neq_os(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_neq_os + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_cmp_pbh_mask_ge_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_gt_oq(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_cmp_pbh_mask_true_us(__m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_cmp_pbh_mask_true_us + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + return _mm256_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: @test_mm256_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_lt_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_le_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_unord_q(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nlt_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nle_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and 
<16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ord_q(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nge_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ngt_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_false_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ge_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_gt_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_true_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_lt_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_le_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: 
and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_unord_s(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nlt_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nle_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ord_s(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_eq_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_nge_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ngt_uq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_false_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_neq_os(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_ge_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_gt_oq(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <16 x bfloat> %{{.*}}, %{{.*}} + 
// CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_mask_cmp_pbh_mask_true_us(__mmask16 m, __m256bh a, __m256bh b) { + // CHECK-LABEL: test_mm256_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <16 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> %{{.*}}, %{{.*}} + return _mm256_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: @test_mm_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_lt_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_le_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_le_os + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_unord_q(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nlt_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nle_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_ord_q(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nge_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_cmp_pbh_mask_ngt_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_cmp_pbh_mask_false_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_false_oq + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ge_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_gt_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, 
b, _CMP_GT_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_true_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_true_uq + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_lt_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_le_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_unord_s(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_us + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nlt_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_nle_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ord_s(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_cmp_pbh_mask_eq_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_cmp_pbh_mask_nge_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_ngt_uq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_cmp_pbh_mask_false_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_false_os + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_neq_os(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_neq_os + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_cmp_pbh_mask_ge_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_gt_oq(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_cmp_pbh_mask_true_us(__m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_cmp_pbh_mask_true_us + // CHECK: fcmp true <8 x bfloat> 
%{{.*}}, %{{.*}} + return _mm_cmp_pbh_mask(a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: @test_mm_mask_cmp_pbh_mask_eq_oq + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_lt_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_lt_os + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_le_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_le_os + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_unord_q(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_unord_q + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_uq + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nlt_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nlt_us + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nle_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nle_us + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ord_q(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ord_q + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_uq + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nge_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nge_us + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ngt_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ngt_us + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_false_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_false_oq + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_oq(__mmask8 m, __m128bh a, __m128bh b) { 
+ // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_oq + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ge_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ge_os + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_gt_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_gt_os + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_true_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_true_uq + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_os + // CHECK: fcmp oeq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_lt_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_lt_oq + // CHECK: fcmp olt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_le_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_le_oq + // CHECK: fcmp ole <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_unord_s(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_unord_s + // CHECK: fcmp uno <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_us + // CHECK: fcmp une <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nlt_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nlt_uq + // CHECK: fcmp uge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nle_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nle_uq + // CHECK: fcmp ugt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ord_s(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ord_s + // CHECK: fcmp ord <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_eq_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_eq_us + // CHECK: fcmp ueq <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return 
_mm_mask_cmp_pbh_mask(m, a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_nge_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_nge_uq + // CHECK: fcmp ult <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ngt_uq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ngt_uq + // CHECK: fcmp ule <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_false_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_false_os + // CHECK: fcmp false <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_neq_os(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_neq_os + // CHECK: fcmp one <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_ge_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_ge_oq + // CHECK: fcmp oge <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_gt_oq(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_gt_oq + // CHECK: fcmp ogt <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_mask_cmp_pbh_mask_true_us(__mmask8 m, __m128bh a, __m128bh b) { + // CHECK-LABEL: test_mm_mask_cmp_pbh_mask_true_us + // CHECK: fcmp true <8 x bfloat> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> %{{.*}}, %{{.*}} + return _mm_mask_cmp_pbh_mask(m, a, b, _CMP_TRUE_US); +} + + +__mmask16 test_mm256_mask_fpclass_pbh_mask(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.256 + return _mm256_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask16 test_mm256_fpclass_pbh_mask(__m256bh __A) { + // CHECK-LABEL: @test_mm256_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.256 + return _mm256_fpclass_pbh_mask(__A, 4); +} + +__mmask8 test_mm_mask_fpclass_pbh_mask(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.128 + return _mm_mask_fpclass_pbh_mask(__U, __A, 4); +} + +__mmask8 test_mm_fpclass_pbh_mask(__m128bh __A) { + // CHECK-LABEL: @test_mm_fpclass_pbh_mask + // CHECK: @llvm.x86.avx10.fpclass.nepbf16.128 + return _mm_fpclass_pbh_mask(__A, 4); +} + +__m256bh test_mm256_scalef_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_scalef_pbh(__A, __B); +} + +__m256bh test_mm256_mask_scalef_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m256bh test_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.256 + return _mm256_maskz_scalef_pbh(__U, 
__A, __B); +} + +__m256bh test_mm256_rcp_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return _mm256_rcp_pbh(__A); +} + +__m256bh test_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return (__m256bh)_mm256_mask_rcp_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.256 + return _mm256_maskz_rcp_pbh(__U, __A); +} + +__m256bh test_mm256_getexp_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_getexp_pbh(__A); +} + +__m256bh test_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_mask_getexp_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.256 + return _mm256_maskz_getexp_pbh(__U, __A); +} + +__m256bh test_mm256_rsqrt_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return _mm256_rsqrt_pbh(__A); +} + +__m256bh test_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return (__m256bh)_mm256_mask_rsqrt_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.256 + return _mm256_maskz_rsqrt_pbh(__U, __A); +} + +__m256bh test_mm256_reducene_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_reducene_pbh(__A, 3); +} + +__m256bh test_mm256_mask_reducene_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m256bh test_mm256_maskz_reducene_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.256 + return _mm256_maskz_reducene_pbh(__U, __A, 1); +} + +__m256bh test_mm256_roundscalene_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_roundscalene_pbh(__A, 3); +} + +__m256bh test_mm256_mask_roundscalene_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m256bh test_mm256_maskz_roundscalene_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.256 + return _mm256_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m256bh test_mm256_getmant_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_mask_getmant_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: 
@test_mm256_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_maskz_getmant_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.256 + return _mm256_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256bh test_mm256_sqrt_pbh(__m256bh __A) { + // CHECK-LABEL: @test_mm256_sqrt_pbh + // CHECK: call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %{{.*}}) + return _mm256_sqrt_pbh(__A); +} + +__m256bh test_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_mask_sqrt_pbh + // CHECK: @llvm.sqrt.v16bf16 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return (__m256bh)_mm256_mask_sqrt_pbh(__W, __U, __A); +} + +__m256bh test_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { + // CHECK-LABEL: @test_mm256_maskz_sqrt_pbh + // CHECK: @llvm.sqrt.v16bf16 + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_sqrt_pbh(__U, __A); +} + +__m128bh test_mm_scalef_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_scalef_pbh(__A, __B); +} + +__m128bh test_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_mask_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_mask_scalef_pbh(__W, __U, __A, __B); +} + +__m128bh test_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_maskz_scalef_pbh + // CHECK: @llvm.x86.avx10.mask.scalef.nepbf16.128 + return _mm_maskz_scalef_pbh(__U, __A, __B); +} + +__m128bh test_mm_rcp_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return _mm_rcp_pbh(__A); +} + +__m128bh test_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return (__m128bh)_mm_mask_rcp_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_rcp_pbh + // CHECK: @llvm.x86.avx10.mask.rcp.nepbf16.128 + return _mm_maskz_rcp_pbh(__U, __A); +} + +__m128bh test_mm_getexp_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_getexp_pbh(__A); +} + +__m128bh test_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_mask_getexp_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_getexp_pbh + // CHECK: @llvm.x86.avx10.mask.getexp.nepbf16.128 + return _mm_maskz_getexp_pbh(__U, __A); +} + +__m128bh test_mm_rsqrt_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return _mm_rsqrt_pbh(__A); +} + +__m128bh test_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_rsqrt_pbh + // CHECK: @llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return (__m128bh)_mm_mask_rsqrt_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_rsqrt_pbh + // CHECK: 
@llvm.x86.avx10.mask.rsqrt.nepbf16.128 + return _mm_maskz_rsqrt_pbh(__U, __A); +} + +__m128bh test_mm_reducene_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_reducene_pbh(__A, 3); +} + +__m128bh test_mm_mask_reducene_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_mask_reducene_pbh(__W, __U, __A, 1); +} + +__m128bh test_mm_maskz_reducene_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_reducene_pbh + // CHECK: @llvm.x86.avx10.mask.reduce.nepbf16.128 + return _mm_maskz_reducene_pbh(__U, __A, 1); +} + +__m128bh test_mm_roundscalene_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_roundscalene_pbh(__A, 3); +} + +__m128bh test_mm_mask_roundscalene_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_mask_roundscalene_pbh(__W, __U, __A, 1); +} + +__m128bh test_mm_maskz_roundscalene_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_roundscalene_pbh + // CHECK: @llvm.x86.avx10.mask.rndscale.nepbf16.128 + return _mm_maskz_roundscalene_pbh(__U, __A, 1 ); +} + +__m128bh test_mm_getmant_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_getmant_pbh(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_mask_getmant_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_mask_getmant_pbh(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_maskz_getmant_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_getmant_pbh + // CHECK: @llvm.x86.avx10.mask.getmant.nepbf16.128 + return _mm_maskz_getmant_pbh(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128bh test_mm_sqrt_pbh(__m128bh __A) { + // CHECK-LABEL: @test_mm_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + return _mm_sqrt_pbh(__A); +} + +__m128bh test_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_mask_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return (__m128bh)_mm_mask_sqrt_pbh(__W, __U, __A); +} + +__m128bh test_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { + // CHECK-LABEL: @test_mm_maskz_sqrt_pbh + // CHECK: call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> {{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_sqrt_pbh(__U, __A); +} + +__m256bh test_mm256_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fmaddne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fmaddne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return 
_mm256_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fmaddne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fmaddne_pbh + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fmaddne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fmsubne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fmsubne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fmsubne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fnmaddne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fnmaddne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fnmaddne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return 
_mm256_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fnmaddne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fnmaddne_pbh(__U, __A, __B, __C); +} + +__m256bh test_mm256_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + return _mm256_fnmsubne_pbh(__A, __B, __C); +} + +__m256bh test_mm256_mask_fnmsubne_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_mask_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask_fnmsubne_pbh(__A, __U, __B, __C); +} + +__m256bh test_mm256_mask3_fnmsubne_pbh(__m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_mask3_fnmsubne_pbh(__A, __B, __C, __U); +} + +__m256bh test_mm256_maskz_fnmsubne_pbh(__mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}}) + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + return _mm256_maskz_fnmsubne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fmaddne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fmaddne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fmaddne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fmaddne_pbh + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fmaddne_pbh(__U, __A, 
__B, __C); +} + +__m128bh test_mm_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fmsubne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fmsubne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fmsubne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fmsubne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fmsubne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fnmaddne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fnmaddne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fnmaddne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fnmaddne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fnmaddne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fnmaddne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fnmaddne_pbh + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fnmaddne_pbh(__U, __A, __B, __C); +} + +__m128bh test_mm_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + return _mm_fnmsubne_pbh(__A, __B, __C); +} + +__m128bh test_mm_mask_fnmsubne_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_mask_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> 
@llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask_fnmsubne_pbh(__A, __U, __B, __C); +} + +__m128bh test_mm_mask3_fnmsubne_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_mask3_fnmsubne_pbh(__A, __B, __C, __U); +} + +__m128bh test_mm_maskz_fnmsubne_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_fnmsubne_pbh + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + return _mm_maskz_fnmsubne_pbh(__U, __A, __B, __C); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 8d000ed1e4f85..fafa5051bfb1b 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -7219,3 +7219,256 @@ def int_x86_avx10_mask_vcvtneph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2 DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; } + +//===----------------------------------------------------------------------===// +let TargetPrefix = "x86" in { +def int_x86_avx10_vaddnepbf16512 : ClangBuiltin<"__builtin_ia32_vaddnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vaddnepbf16256 : ClangBuiltin<"__builtin_ia32_vaddnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vaddnepbf16128 : ClangBuiltin<"__builtin_ia32_vaddnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16512 : ClangBuiltin<"__builtin_ia32_vsubnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16256 : ClangBuiltin<"__builtin_ia32_vsubnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vsubnepbf16128 : ClangBuiltin<"__builtin_ia32_vsubnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16512 : ClangBuiltin<"__builtin_ia32_vmulnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16256 : ClangBuiltin<"__builtin_ia32_vmulnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmulnepbf16128 : ClangBuiltin<"__builtin_ia32_vmulnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vdivnepbf16512 : ClangBuiltin<"__builtin_ia32_vdivnepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vdivnepbf16256 : ClangBuiltin<"__builtin_ia32_vdivnepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + 
[IntrNoMem]>; +def int_x86_avx10_vdivnepbf16128 : ClangBuiltin<"__builtin_ia32_vdivnepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16512 : ClangBuiltin<"__builtin_ia32_vmaxpbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16256 : ClangBuiltin<"__builtin_ia32_vmaxpbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vmaxpbf16128 : ClangBuiltin<"__builtin_ia32_vmaxpbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16512 : ClangBuiltin<"__builtin_ia32_vminpbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16256 : ClangBuiltin<"__builtin_ia32_vminpbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vminpbf16128 : ClangBuiltin<"__builtin_ia32_vminpbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16eq : ClangBuiltin<"__builtin_ia32_vcomsbf16eq">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16lt : ClangBuiltin<"__builtin_ia32_vcomsbf16lt">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty,llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16le : ClangBuiltin<"__builtin_ia32_vcomsbf16le">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16gt : ClangBuiltin<"__builtin_ia32_vcomsbf16gt">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16ge : ClangBuiltin<"__builtin_ia32_vcomsbf16ge">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcomsbf16neq : ClangBuiltin<"__builtin_ia32_vcomsbf16neq">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rsqrt_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrsqrtpbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrcppbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrcppbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_rcp_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrcppbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_reduce_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vreducenepbf16128_mask">, + 
DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_reduce_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vreducenepbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_reduce_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vreducenepbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_128 : + DefaultAttrsIntrinsic<[llvm_v8i1_ty], [llvm_v8bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_256 : + DefaultAttrsIntrinsic<[llvm_v16i1_ty], [llvm_v16bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_fpclass_nepbf16_512 : + DefaultAttrsIntrinsic<[llvm_v32i1_ty], [llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getexp_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vgetexppbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getexp_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vgetexppbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getexp_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vgetexppbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_getmant_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getmant_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_getmant_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vgetmantpbf16512_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_i32_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_i32_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_rndscale_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vrndscalenepbf16_mask">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_i32_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; +def int_x86_avx10_mask_scalef_nepbf16_128 : ClangBuiltin<"__builtin_ia32_vscalefpbf16128_mask">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_scalef_nepbf16_256 : ClangBuiltin<"__builtin_ia32_vscalefpbf16256_mask">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_scalef_nepbf16_512 : ClangBuiltin<"__builtin_ia32_vscalefpbf16512_mask">, +
DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmadd231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmadd231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfmsub231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfmsub231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, 
llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmadd231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmadd231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub213nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub213nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub132nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub132nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub231nepbf16512 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16512">, + DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_v32bf16_ty], + [IntrNoMem]>; +def int_x86_avx10_vfnmsub231nepbf16256 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16256">, + DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_v16bf16_ty], + [IntrNoMem]>; 
+def int_x86_avx10_vfnmsub231nepbf16128 : ClangBuiltin<"__builtin_ia32_vfnmsub231nepbf16128">, + DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_v8bf16_ty ], + [IntrNoMem]>; +} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 2b6b0ad16bcf7..03f49306c2b7b 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3305,11 +3305,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if ((PatchedName.starts_with("cmp") || PatchedName.starts_with("vcmp")) && (PatchedName.ends_with("ss") || PatchedName.ends_with("sd") || PatchedName.ends_with("sh") || PatchedName.ends_with("ph") || - PatchedName.ends_with("ps") || PatchedName.ends_with("pd"))) { + PatchedName.ends_with("pbf16") || PatchedName.ends_with("ps") || + PatchedName.ends_with("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned suffixLength = PatchedName.ends_with("pbf16") ? 5 : 2; unsigned CC = StringSwitch( - PatchedName.slice(CCIdx, PatchedName.size() - 2)) + PatchedName.slice(CCIdx, PatchedName.size() - suffixLength)) .Case("eq", 0x00) .Case("eq_oq", 0x00) .Case("lt", 0x01) @@ -3372,6 +3374,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName = "vcmpsh"; else if (PatchedName.ends_with("ph")) PatchedName = "vcmpph"; + else if (PatchedName.ends_with("pbf16")) + PatchedName = "vcmppbf16"; else llvm_unreachable("Unexpected suffix!"); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 33104524c5a89..8fcc1c10d93a0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -167,6 +167,15 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); @@ -205,7 +214,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, printwordmem(MI, CurOp--, OS); else printdwordmem(MI, CurOp--, OS); - } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) { + } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD && + (Desc.TSFlags & X86II::OpMapMask) != X86II::TA) { assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA && "Unexpected op map!"); printqwordmem(MI, CurOp--, OS); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index ad1f2dc532d1c..e7ba13215feb5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -309,6 +309,17 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, case 
X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk: OS << "sh\t"; break; + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: + OS << "pbf16\t"; + break; } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 7c8459a546516..39600ffcadd8e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -146,6 +146,15 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk: + case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri: + case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri: + case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri: + case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik: + case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik: + case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik: + case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik: + case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik: + case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f7878c78a5231..451881e1d6141 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2360,6 +2360,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom); } + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) { + addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); + addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); + addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); + + setOperationAction(ISD::FADD, MVT::v32bf16, Legal); + setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); + setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); + setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); + setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); + setOperationAction(ISD::FMA, MVT::v32bf16, Legal); + setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); + if (Subtarget.hasVLX()) { + for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { + setOperationAction(ISD::FADD, VT, Legal); + setOperationAction(ISD::FSUB, VT, Legal); + setOperationAction(ISD::FMUL, VT, Legal); + setOperationAction(ISD::FDIV, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + } + } + } + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); @@ -12212,7 +12237,8 @@ static bool isShuffleFoldableLoad(SDValue V) { template static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { T EltVT = VT.getScalarType(); - return EltVT == MVT::bf16 || (EltVT == MVT::f16 && 
!Subtarget.hasFP16()); + return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) || + (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. @@ -23265,7 +23291,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (isFP) { MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); - assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 || + EltVT == MVT::f64); if (isSoftF16(EltVT, Subtarget)) return SDValue(); @@ -23282,7 +23309,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Op0.getSimpleValueType().is512BitVector())) { #ifndef NDEBUG unsigned Num = VT.getVectorNumElements(); - assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16)); + assert(Num <= 16 || + (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16))); #endif Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { @@ -54159,7 +54187,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, EVT ScalarVT = VT.getScalarType(); if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) && - !(ScalarVT == MVT::f16 && Subtarget.hasFP16())) + !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) && + !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2())) return SDValue(); auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index a518347cfcd82..b0eb210b687b1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -910,3 +910,318 @@ multiclass avx10_convert_2op_nomb, AVX512XDIi8Base, T_MAP5, EVEX, EVEX_CD8<16, CD8VH>; + +//------------------------------------------------- +// AVX10 BF16 instructions +//------------------------------------------------- + +// VADDNEPBF16, VSUBNEPBF16, VMULNEPBF16, VDIVNEPBF16, VMAXPBF16, VMINPBF16 +multiclass avx10_fp_binopne_int_pbf16 opc, string OpcodeStr, + X86SchedWriteSizes sched, + bit IsCommutable = 0> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16512"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16512"), + v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16128"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16128"), + v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fp_packed("int_x86_avx10_"#OpcodeStr#"pbf16256"), + !cast("int_x86_avx10_"#OpcodeStr#"pbf16256"), + v16bf16x_info, sched.PH.YMM, IsCommutable>, EVEX_V256, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + } +} + +multiclass avx10_fp_binop_pbf16 opc, string OpcodeStr, SDPatternOperator OpNode, + X86SchedWriteSizes sched, + bit IsCommutable = 0, + SDPatternOperator MaskOpNode = OpNode> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_packed, EVEX_V512, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_packed, EVEX_V128, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fp_packed, EVEX_V256, + T_MAP5, PD, EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VADDNEPBF16 : avx10_fp_binop_pbf16<0x58, "vaddne", fadd, SchedWriteFAddSizes, 1>; +defm VSUBNEPBF16 : avx10_fp_binop_pbf16<0x5C, "vsubne", fsub, SchedWriteFAddSizes, 0>; +defm VMULNEPBF16 : avx10_fp_binop_pbf16<0x59, 
"vmulne", fmul, SchedWriteFMulSizes, 1>; +defm VDIVNEPBF16 : avx10_fp_binop_pbf16<0x5E, "vdivne", fdiv, SchedWriteFDivSizes, 0>; +defm VMINPBF16 : avx10_fp_binopne_int_pbf16<0x5D, "vmin", SchedWriteFCmpSizes, 0>; +defm VMAXPBF16 : avx10_fp_binopne_int_pbf16<0x5F, "vmax", SchedWriteFCmpSizes, 0>; +} + +// VCOMSBF16 +let Uses = [], mayRaiseFPException = 0, + Defs = [EFLAGS], Predicates = [HasAVX10_2] in { + //TODO: Replace null_frag with X86fcmp to support lowering `fcmp oeq bfloat *` + //which may require extend supports on BFR16X, loadbf16, ... + defm VCOMSBF16Z : sse12_ord_cmp<0x2F, FR16X, null_frag, bf16, f16mem, loadf16, + "comsbf16", SSEPackedSingle>, T_MAP5, PD, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + + let isCodeGenOnly = 1 in { + defm VCOMSBF16Z : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8bf16, f16mem, + sse_load_bf16, "comsbf16", SSEPackedSingle>, + T_MAP5, PD, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + } +} + +// VCMPPBF16 +multiclass avx10_vcmp_common_bf16 { + let mayRaiseFPException = 0 in { + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + 1>, Sched<[sched]>; + + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + timm:$cc)>, + Sched<[sched.Folded, sched.ReadAfterFold]>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $cc", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc)>, + EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } +} + +multiclass avx10_vcmp_bf16 { + let Predicates = [HasAVX10_2_512] in + defm Z : avx10_vcmp_common_bf16, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx10_vcmp_common_bf16, EVEX_V128; + defm Z256 : avx10_vcmp_common_bf16, EVEX_V256; + } +} + +defm VCMPPBF16 : avx10_vcmp_bf16, + AVX512XDIi8Base, EVEX, VVVV, + EVEX_CD8<16, CD8VF>, TA; + + +// VSQRTNEPBF16 +multiclass avx10_sqrt_packed_bf16 opc, string OpcodeStr, + X86SchedWriteSizes sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_sqrt_packed, + EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_sqrt_packed, + EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_sqrt_packed, + EVEX_V256, PD, T_MAP5, EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in +defm VSQRTNEPBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrtne", SchedWriteFSqrtSizes>; + +// VRSQRTPBF16, VRCPPBF16, VSRQTPBF16, VGETEXPPBF16 +multiclass avx10_fp14_pbf16 opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm PBF16Z : avx512_fp14_p, + EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm PBF16Z128 : avx512_fp14_p, + EVEX_V128; + defm PBF16Z256 : avx512_fp14_p, + EVEX_V256; + } +} + +defm VRSQRT : avx10_fp14_pbf16<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>, + 
T_MAP6, PS, EVEX_CD8<16, CD8VF>; +defm VRCP : avx10_fp14_pbf16<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>, + T_MAP6, PS, EVEX_CD8<16, CD8VF>; +defm VGETEXP : avx10_fp14_pbf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, + T_MAP5, EVEX_CD8<16, CD8VF>; + +// VSCALEFPBF16 +multiclass avx10_fp_scalef_bf16 opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fp_scalef_p, + EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fp_scalef_p, + EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS; + defm Z256 : avx512_fp_scalef_p, + EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PS; + } +} + +let Uses = [], mayRaiseFPException = 0 in +defm VSCALEFPBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>; + +// VREDUCENEPBF16, VRNDSCALENEPBF16, VGETMANTPBF16 +multiclass avx10_common_unary_fp_packed_imm_bf16 opc, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_unary_fp_packed_imm, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_unary_fp_packed_imm, EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm, EVEX_V256; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VREDUCENEPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vreducene", avx512vl_bf16_info, 0x56, + X86VReduce, X86VReduce, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +defm VRNDSCALENEPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vrndscalene", avx512vl_bf16_info, 0x08, + X86any_VRndScale, X86VRndScale, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +defm VGETMANTPBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_bf16_info, 0x26, + X86VGetMant, X86VGetMant, SchedWriteFRnd>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; +} + +// VFPCLASSPBF16 +multiclass avx10_fp_fpclass_bf16 opcVec, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_vector_fpclass>, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_vector_fpclass>, EVEX_V128; + defm Z256 : avx512_vector_fpclass>, EVEX_V256; + } +} + +defm VFPCLASSPBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>, + AVX512XDIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; + +// VF[,N]M[ADD,SUB][132,213,231]NEPBF16 +multiclass avx10_fma3p_213_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_213_rm, EVEX_V512, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_213_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_213_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD213NEPBF16 : avx10_fma3p_213_bf16<0xA8, "vfmadd213nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB213NEPBF16 : avx10_fma3p_213_bf16<0xAA, "vfmsub213nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD213NEPBF16 : avx10_fma3p_213_bf16<0xAC, "vfnmadd213nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB213NEPBF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} + +multiclass avx10_fma3p_231_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_231_rm, EVEX_V512, T_MAP6, 
PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_231_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_231_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD231NEPBF16 : avx10_fma3p_231_bf16<0xB8, "vfmadd231nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB231NEPBF16 : avx10_fma3p_231_bf16<0xBA, "vfmsub231nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD231NEPBF16 : avx10_fma3p_231_bf16<0xBC, "vfnmadd231nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB231NEPBF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} + +multiclass avx10_fma3p_132_bf16 opc, string OpcodeStr, + SDPatternOperator OpNode, SDNode MaskOpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx512_fma3p_132_rm, EVEX_V512, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx512_fma3p_132_rm, EVEX_V128, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + defm Z256 : avx512_fma3p_132_rm, EVEX_V256, T_MAP6, PS, + EVEX_CD8<16, CD8VF>; + } +} + +let Uses = [], mayRaiseFPException = 0 in { +defm VFMADD132NEPBF16 : avx10_fma3p_132_bf16<0x98, "vfmadd132nepbf16", any_fma, + fma, SchedWriteFMA>; +defm VFMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9A, "vfmsub132nepbf16", X86any_Fmsub, + X86Fmsub, SchedWriteFMA>; +defm VFNMADD132NEPBF16 : avx10_fma3p_132_bf16<0x9C, "vfnmadd132nepbf16", X86any_Fnmadd, + X86Fnmadd, SchedWriteFMA>; +defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_Fnmsub, + X86Fnmsub, SchedWriteFMA>; +} diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 88d1eb5986243..c988524213123 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2495,8 +2495,8 @@ multiclass avx512_scalar_fpclass opc, string OpcodeStr, // fpclass(reg_vec, broadcast(eltVt), imm) multiclass avx512_vector_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - string mem>{ - let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { + string mem, list _Uses = [MXCSR]>{ + let ExeDomain = _.ExeDomain, Uses = _Uses in { def rr : AVX512, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; + def X86MaskCmpMaskCC : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, @@ -1139,6 +1140,10 @@ def X86SubVBroadcastld256 : PatFrag<(ops node:$src), // only load a single element. // FIXME: We should add more canolicalizing in DAGCombine. Particulary removing // the simple_load case. 
+def sse_load_bf16 : PatFrags<(ops node:$ptr), + [(v8bf16 (simple_load node:$ptr)), + (v8bf16 (X86vzload16 node:$ptr)), + (v8bf16 (scalar_to_vector (loadf16 node:$ptr)))]>; def sse_load_f16 : PatFrags<(ops node:$ptr), [(v8f16 (simple_load node:$ptr)), (v8f16 (X86vzload16 node:$ptr)), diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 208af630a352d..531268b41da96 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -313,7 +313,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; -def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbh">; +def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbf16">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -323,7 +323,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; -def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbh">; +def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbf16">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -332,7 +332,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">; -def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbh">; +def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbf16">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 68c1ce072549b..4f39e66e22c23 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -389,6 +389,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_128, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_256, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_fpclass_nepbf16_512, INTR_TYPE_2OP, + X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getexp_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_getmant_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx10_mask_rcp_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::RCP14, 0), + 
X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_reduce_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rndscale_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_128, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_256, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_rsqrt_nepbf16_512, INTR_TYPE_1OP_MASK, + X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_128, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_256, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx10_mask_scalef_nepbf16_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx10_mask_vcmppd256, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), X86_INTRINSIC_DATA(avx10_mask_vcmpph256, CMP_MASK_CC, X86ISD::CMPMM, @@ -655,6 +703,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx10_vcomsbf16eq, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(avx10_vcomsbf16ge, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(avx10_vcomsbf16gt, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(avx10_vcomsbf16le, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(avx10_vcomsbf16lt, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(avx10_vcomsbf16neq, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8128, INTR_TYPE_2OP, X86ISD::VCVTNE2PH2BF8, 0), X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8256, INTR_TYPE_2OP, diff --git a/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll new file mode 100644 index 0000000000000..c41e03ba637cb --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll @@ -0,0 +1,1244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx10.2-512 | FileCheck %s + +define <8 x bfloat> @fma_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> 
%z) { +; CHECK-LABEL: fma_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) { +; CHECK-LABEL: fma_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + ret <8 x bfloat> %a +} + +define <8 x bfloat> @fma_mask_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x 
i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_231_v8bf16(<8 
x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 
x bfloat> @fma_mask_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_mask_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> %x + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_213_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %x, <8 x bfloat> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_231_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %y, <8 x bfloat> %z, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_321_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> 
%z, <8 x bfloat> %y, <8 x bfloat> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_132_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %x, <8 x bfloat> %z, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @fma_maskz_load_312_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, ptr %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x bfloat>, ptr %zp + %a = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %z, <8 x bfloat> %x, <8 x bfloat> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x bfloat> %a, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %c +} + +define <16 x bfloat> @fma_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) { +; CHECK-LABEL: fma_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z 
= load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + ret <16 x bfloat> %a +} + +define <16 x bfloat> @fma_mask_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + 
ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, 
<16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; 
CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_mask_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> %x + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_123_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_213_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %x, <16 x bfloat> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_231_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %y, <16 x bfloat> %z, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_321_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %y, <16 x bfloat> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_132_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %x, <16 x bfloat> %z, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x 
bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @fma_maskz_load_312_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y, ptr %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x bfloat>, ptr %zp + %a = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %z, <16 x bfloat> %x, <16 x bfloat> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x bfloat> %a, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %c +} + +define <32 x bfloat> @fma_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) { +; CHECK-LABEL: fma_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { 
+; CHECK-LABEL: fma_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp) { +; CHECK-LABEL: fma_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + ret <32 x bfloat> %a +} + +define <32 x bfloat> @fma_mask_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 
%mask) { +; CHECK-LABEL: fma_mask_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> 
@fma_maskz_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_mask_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> 
@llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> %x + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_123_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_213_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %x, <32 x bfloat> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_231_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %y, <32 x bfloat> %z, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_321_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %y, <32 x bfloat> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_132_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %x, <32 x bfloat> %z, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %c +} + +define <32 x bfloat> @fma_maskz_load_312_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y, ptr %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132nepbf16 (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x bfloat>, ptr %zp + %a = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %z, <32 x bfloat> %x, <32 x bfloat> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x bfloat> %a, <32 x bfloat> zeroinitializer + ret <32 x bfloat> 
%c +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll new file mode 100644 index 0000000000000..33c40ac6bb32c --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +define <32 x bfloat> @test_int_x86_avx10_vaddnepbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vaddnepbf16512: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fadd <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x08] +; X86-NEXT: vaddnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fadd <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fadd <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define <32 x bfloat> @test_int_x86_avx10_sub_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> 
%x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fsub <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fsub <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fsub <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fsub <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +declare <32 x bfloat> @llvm.x86.avx10.vmulnepbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_mul_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: 
[0x62,0xf5,0x75,0x49,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fmul <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x08] +; X86-NEXT: vmulnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fmul <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fmul <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fmul <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define <32 x bfloat> @test_int_x86_avx10_div_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <32 x bfloat> %x1, %x2 + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_mask_div_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x49,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = fdiv <32 x bfloat> %x1, %x2 + %res = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> %src + ret <32 x bfloat> %res +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_512(<32 x bfloat> %src, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk, ptr %ptr) { +; X64-LABEL: 
test_int_x86_avx10_maskz_div_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x bfloat>, ptr %ptr + %res0 = fdiv <32 x bfloat> %x1, %x2 + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + %t2 = fdiv <32 x bfloat> %x1, %val + %res2 = select <32 x i1> %mask, <32 x bfloat> %t2, <32 x bfloat> zeroinitializer + %res3 = fdiv <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +define i32 @test_int_x86_avx10_vcmppbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x48,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <32 x bfloat> %x1, %x2 + %res = bitcast <32 x i1> %1 to i32 + ret i32 %res +} + +; FIXME: _mm512_mask_cmp_p[s|h]_mask is not using {k2} but gcc does +define i32 @test_int_x86_avx10_vcmppbf16512_mask2(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16512_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x48,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andl $3, %eax # encoding: [0x83,0xe0,0x03] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp oeq <32 x bfloat> %x1, %x2 + %2 = and <32 x i1> %1, + %3 = bitcast <32 x i1> %2 to i32 + ret i32 %3 +} + +define <32 x bfloat> @test_sqrt_nepbf16_512(<32 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %a0) + ret <32 x bfloat> %1 +} + +define <32 x bfloat> @test_mm512_mask_sqrt_pbh(<32 x bfloat> %__W, i32 %__U, <32 x bfloat> %__A) { +; X64-LABEL: test_mm512_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x51,0xc1] +; 
X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %__A) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__W + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_sqrt_pbh(i32 %__U, <32 x bfloat>%__A) { +; X64-LABEL: test_mm512_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.sqrt.v32bf16(<32 x bfloat> %__A) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fmaddne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xb8,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xb8,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x 
bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fmaddne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fmsubne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xba,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xba,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x 
i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fmsubne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %fneg.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fnmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i, <32 x bfloat> %__C) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fnmaddne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fnmaddne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbc,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbc,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> 
%fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fnmaddne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %__C) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_fnmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; CHECK-LABEL: test_mm512_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <32 x bfloat> %__B + %fneg1.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i, <32 x bfloat> %fneg1.i) + ret <32 x bfloat> %0 +} + +define <32 x bfloat> @test_mm512_mask_fnmsubne_pbh(<32 x bfloat> %__A, i32 %__U, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x49,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__A + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_mask3_fnmsubne_pbh(<32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C, i32 %__U) { +; X64-LABEL: test_mm512_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbe,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0xbe,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: 
[0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> %__C + ret <32 x bfloat> %2 +} + +define <32 x bfloat> @test_mm512_maskz_fnmsubne_pbh(i32 %__U, <32 x bfloat> %__A, <32 x bfloat> %__B, <32 x bfloat> %__C) { +; X64-LABEL: test_mm512_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xc9,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <32 x bfloat> %__B + %fneg1.i.i = fneg <32 x bfloat> %__C + %0 = tail call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %__A, <32 x bfloat> %fneg.i.i, <32 x bfloat> %fneg1.i.i) + %1 = bitcast i32 %__U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %2 +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll new file mode 100644 index 0000000000000..7b81d547db085 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +declare <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_min_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + ret <32 x bfloat> %res0 +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x bfloat> @llvm.x86.avx10.vminpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %res1 +} + 
+declare <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat>, <32 x bfloat>) + +define <32 x bfloat> @test_int_x86_avx10_max_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + ret <32 x bfloat> %res0 +} + +define <32 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_512(<32 x bfloat> %x1, <32 x bfloat> %x2, i32 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x bfloat> @llvm.x86.avx10.vmaxpbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) + %res1 = select <32 x i1> %mask, <32 x bfloat> %res0, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %res1 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat> @test_rsqrt_nepbf16_512(<32 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.512(<32 x bfloat> %a0, <32 x bfloat> zeroinitializer, i32 -1) + ret <32 x bfloat> %res +} + +declare <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat>, i32) + +define i32 @test_int_x86_avx512_fpclass_nepbf16_512(<32 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %zmm0, %k1 # encoding: [0x62,0xf3,0x7f,0x48,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat> %x0, i32 4) + %res1 = call <32 x i1> @llvm.x86.avx10.fpclass.nepbf16.512(<32 x bfloat> %x0, i32 2) + %1 = and <32 x i1> %res1, %res + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat> @test_rcp_nepbf16_512(<32 x bfloat> %a0, <32 x bfloat> %a1, i32 %mask) { +; X64-LABEL: test_rcp_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x4c,0xc8] +; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x4c,0xc8] +; 
X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.512(<32 x bfloat> %a0, <32 x bfloat> %a1, i32 %mask) + ret <32 x bfloat> %res +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x1, i32 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd 
%edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0] +; X64-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x42,0xc0] +; X86-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, i32 %x2) + %res2 = call <32 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> zeroinitializer, i32 -1) + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat>, i32, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x49,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat> %x0, i32 8, <32 x bfloat> %x2, i32 %x3) + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.512(<32 x bfloat> %x0, i32 4, <32 x bfloat> %x2, i32 -1) + %res2 = fadd <32 x bfloat> %res, %res1 + ret <32 x bfloat> %res2 +} + +declare <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>, i32) + +define <32 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x2c,0xc1] +; X64-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd0] +; X64-NEXT: vaddnepbf16 %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf5,0x6d,0x48,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x2c,0xc1] 
+; X86-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd0] +; X86-NEXT: vaddnepbf16 %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf5,0x6d,0x48,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i32 %x3 to <32 x i1> + %res1 = call <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> %x2, i32 %x3) + %res2 = call <32 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.512(<32 x bfloat> %x0, <32 x bfloat> %x1, <32 x bfloat> zeroinitializer, i32 -1) + %res3 = fadd <32 x bfloat> %res1, %res2 + ret <32 x bfloat> %res3 +} diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll new file mode 100644 index 0000000000000..e0f5679e8ac96 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -0,0 +1,1168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +define <16 x bfloat> @test_int_x86_avx10_add_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_add_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fadd <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} +define <16 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x08] +; X86-NEXT: vaddnepbf16 %ymm1, 
%ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fadd <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fadd <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fadd <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_add_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_add_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fadd <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_add_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_add_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x58,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_add_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x58,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fadd <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_add_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_add_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xc2] +; X64-NEXT: vaddnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x0e] +; X64-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_add_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vaddnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xc2] +; X86-NEXT: vaddnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x08] +; X86-NEXT: vaddnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fadd <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fadd <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fadd <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_sub_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5c,0xc1] +; CHECK-NEXT: 
ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fsub <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fsub <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fsub <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fsub <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_sub_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_sub_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fsub <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_sub_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_sub_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5c,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_sub_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0x75,0x09,0x5c,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fsub <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_sub_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] +; X64-NEXT: vsubnepbf16 (%rsi), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x0e] +; X64-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_sub_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsubnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] +; X86-NEXT: vsubnepbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] +; X86-NEXT: vsubnepbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fsub <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fsub <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fsub <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_mul_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fmul <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %ymm1, %ymm0, 
%ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x08] +; X86-NEXT: vmulnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fmul <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fmul <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fmul <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_mul_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_mul_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fmul <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_mask_mul_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_mul_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x59,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_mul_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x59,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fmul <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_mul_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xc2] +; X64-NEXT: vmulnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x0e] +; X64-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_mul_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmulnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xc2] +; X86-NEXT: vmulnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x08] +; X86-NEXT: vmulnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fmul <8 x bfloat> %x1, %x2 
+ %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fmul <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fmul <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define <16 x bfloat> @test_int_x86_avx10_div_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <16 x bfloat> %x1, %x2 + ret <16 x bfloat> %res +} + +define <16 x bfloat> @test_int_x86_avx10_mask_div_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x75,0x29,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = fdiv <16 x bfloat> %x1, %x2 + %res = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> %src + ret <16 x bfloat> %res +} + +; FIXME: assembly order is different from fp16 ones +define <16 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_256(<16 x bfloat> %src, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_div_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %val = load <16 x bfloat>, ptr %ptr + %res0 = fdiv <16 x bfloat> %x1, %x2 + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + %t2 = fdiv <16 x bfloat> %x1, %val + %res2 = select <16 x i1> %mask, <16 x bfloat> %t2, <16 x bfloat> zeroinitializer + %res3 = fdiv <16 x bfloat> %res1, %res2 + ret <16 x bfloat> %res3 +} + +define <8 x bfloat> @test_int_x86_avx10_div_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_div_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = fdiv <8 x bfloat> %x1, %x2 + ret <8 x bfloat> %res +} + +define <8 x bfloat> 
@test_int_x86_avx10_mask_div_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_mask_div_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5e,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_div_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x75,0x09,0x5e,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = fdiv <8 x bfloat> %x1, %x2 + %res = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> %src + ret <8 x bfloat> %res +} + +; FIXME: assembly order is different from fp16 ones +define <8 x bfloat> @test_int_x86_avx10_maskz_div_nepbf16_128(<8 x bfloat> %src, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk, ptr %ptr) { +; X64-LABEL: test_int_x86_avx10_maskz_div_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xc2] +; X64-NEXT: vdivnepbf16 (%rsi), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x0e] +; X64-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_div_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdivnepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xc2] +; X86-NEXT: vdivnepbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x08] +; X86-NEXT: vdivnepbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %val = load <8 x bfloat>, ptr %ptr + %res0 = fdiv <8 x bfloat> %x1, %x2 + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + %t2 = fdiv <8 x bfloat> %x1, %val + %res2 = select <8 x i1> %mask, <8 x bfloat> %t2, <8 x bfloat> zeroinitializer + %res3 = fdiv <8 x bfloat> %res1, %res2 + ret <8 x bfloat> %res3 +} + +define i16 @test_int_x86_avx10_vcmppbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7f,0x28,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <16 x bfloat> %x1, %x2 + %res = bitcast <16 x i1> %1 to i16 + ret i16 %res +} + +define i16 @test_int_x86_avx10_vcmppbf16256_mask2(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16256_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7f,0x28,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andl $3, %eax # encoding: [0x83,0xe0,0x03] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 
= fcmp oeq <16 x bfloat> %x1, %x2 + %2 = and <16 x i1> %1, <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false> + %3 = bitcast <16 x i1> %2 to i16 + ret i16 %3 +} + +define i8 @test_int_x86_avx10_vcmppbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordpbf16 %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x08,0xc2,0xc1,0x03] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp uno <8 x bfloat> %x1, %x2 + %res = bitcast <8 x i1> %1 to i8 + ret i8 %res +} + +define i8 @test_int_x86_avx10_vcmppbf16128_mask2(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_vcmppbf16128_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpeqpbf16 %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7f,0x08,0xc2,0xc1,0x00] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: andb $3, %al # encoding: [0x24,0x03] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fcmp oeq <8 x bfloat> %x1, %x2 + %2 = and <8 x i1> %1, <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} + +define <16 x bfloat> @test_sqrt_nepbf16_256(<16 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %a0) + ret <16 x bfloat> %1 +} + +define <16 x bfloat> @test_mm256_mask_sqrt_pbh(<16 x bfloat> %__W, i16 %__U, <16 x bfloat> %__A) { +; X64-LABEL: test_mm256_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x51,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %__A) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__W + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_sqrt_pbh(i16 %__U, <16 x bfloat>%__A) { +; X64-LABEL: test_mm256_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> %__A) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <8 x bfloat> @test_sqrt_nepbf16_128(<8 x bfloat> %a0) { +; CHECK-LABEL: test_sqrt_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtnepbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x51,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x 
bfloat> %a0) + ret <8 x bfloat> %1 +} + +define <8 x bfloat> @test_mm_mask_sqrt_pbh(<8 x bfloat> %__W, i8 %__U, <8 x bfloat> %__A) { +; X64-LABEL: test_mm_mask_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x51,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x51,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %__A) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__W + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_sqrt_pbh(i8 %__U, <8 x bfloat>%__A) { +; X64-LABEL: test_mm_maskz_sqrt_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vsqrtnepbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x51,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_sqrt_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vsqrtnepbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x51,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %__A) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fmaddne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xb8,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; 
X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xb8,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fmaddne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fmsubne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xba,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xba,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fmsubne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %fneg.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fnmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i, <16 x bfloat> %__C) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fnmaddne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fnmaddne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %ymm1, %ymm0, %ymm2 
{%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbc,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbc,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fnmaddne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %__C) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_fnmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; CHECK-LABEL: test_mm256_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <16 x bfloat> %__B + %fneg1.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i, <16 x bfloat> %fneg1.i) + ret <16 x bfloat> %0 +} + +define <16 x bfloat> @test_mm256_mask_fnmsubne_pbh(<16 x bfloat> %__A, i16 zeroext %__U, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x29,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__A + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_mask3_fnmsubne_pbh(<16 x bfloat> %__A, <16 x bfloat> %__B, <16 x 
bfloat> %__C, i16 zeroext %__U) { +; X64-LABEL: test_mm256_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbe,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0xbe,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> %__C + ret <16 x bfloat> %2 +} + +define <16 x bfloat> @test_mm256_maskz_fnmsubne_pbh(i16 zeroext %__U, <16 x bfloat> %__A, <16 x bfloat> %__B, <16 x bfloat> %__C) { +; X64-LABEL: test_mm256_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0xa9,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <16 x bfloat> %__B + %fneg1.i.i = fneg <16 x bfloat> %__C + %0 = tail call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %__A, <16 x bfloat> %fneg.i.i, <16 x bfloat> %fneg1.i.i) + %1 = bitcast i16 %__U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fmaddne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x 
bfloat> @test_mm_mask3_fmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xb8,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xb8,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fmaddne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xaa,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fmsubne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> 
%__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xba,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xba,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fmsubne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xaa,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xaa,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %fneg.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fnmaddne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fnmaddne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i, <8 x bfloat> %__C) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fnmaddne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fnmaddne_pbh(<8 x 
bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbc,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbc,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fnmaddne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fnmaddne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xac,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fnmaddne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xac,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %__C) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_fnmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; CHECK-LABEL: test_mm_fnmsubne_pbh: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x74,0x08,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %fneg.i = fneg <8 x bfloat> %__B + %fneg1.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i, <8 x bfloat> %fneg1.i) + ret <8 x bfloat> %0 +} + +define <8 x bfloat> @test_mm_mask_fnmsubne_pbh(<8 x bfloat> %__A, i8 zeroext %__U, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_mask_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132nepbf16 %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6c,0x09,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, 
<8 x bfloat> %0, <8 x bfloat> %__A + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_mask3_fnmsubne_pbh(<8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C, i8 zeroext %__U) { +; X64-LABEL: test_mm_mask3_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbe,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask3_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231nepbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0xbe,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> %__C + ret <8 x bfloat> %2 +} + +define <8 x bfloat> @test_mm_maskz_fnmsubne_pbh(i8 zeroext %__U, <8 x bfloat> %__A, <8 x bfloat> %__B, <8 x bfloat> %__C) { +; X64-LABEL: test_mm_maskz_fnmsubne_pbh: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xae,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_fnmsubne_pbh: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub213nepbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x74,0x89,0xae,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %fneg.i.i = fneg <8 x bfloat> %__B + %fneg1.i.i = fneg <8 x bfloat> %__C + %0 = tail call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %__A, <8 x bfloat> %fneg.i.i, <8 x bfloat> %fneg1.i.i) + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %2 +} diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll new file mode 100644 index 0000000000000..559d866b55cc7 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll @@ -0,0 +1,602 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +declare <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat>, <16 x bfloat>) + +define <16 x bfloat> @test_int_x86_avx10_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + ret <16 x bfloat> %res0 +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) { +; 
X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = call <16 x bfloat> @llvm.x86.avx10.vminpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %res1 +} + +declare <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat>, <8 x bfloat>) + +define <8 x bfloat> @test_int_x86_avx10_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_min_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + ret <8 x bfloat> %res0 +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_min_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_min_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vminpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = call <8 x bfloat> @llvm.x86.avx10.vminpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %res1 +} + +declare <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat>, <16 x bfloat>) + +define <16 x bfloat> @test_int_x86_avx10_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 x bfloat> %x2) + ret <16 x bfloat> %res0 +} + +define <16 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2, i16 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %msk to <16 x i1> + %res0 = call <16 x bfloat> @llvm.x86.avx10.vmaxpbf16256(<16 x bfloat> %x1, <16 
x bfloat> %x2) + %res1 = select <16 x i1> %mask, <16 x bfloat> %res0, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %res1 +} + +declare <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat>, <8 x bfloat>) + +define <8 x bfloat> @test_int_x86_avx10_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2) { +; CHECK-LABEL: test_int_x86_avx10_max_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + ret <8 x bfloat> %res0 +} + +define <8 x bfloat> @test_int_x86_avx10_maskz_max_nepbf16_128(<8 x bfloat> %x1, <8 x bfloat> %x2, i8 %msk) { +; X64-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_max_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmaxpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x5f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %msk to <8 x i1> + %res0 = call <8 x bfloat> @llvm.x86.avx10.vmaxpbf16128(<8 x bfloat> %x1, <8 x bfloat> %x2) + %res1 = select <8 x i1> %mask, <8 x bfloat> %res0, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %res1 +} + +declare i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16gt(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat>, <8 x bfloat>) +declare i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat>, <8 x bfloat>) + +define i32 @test_x86_avx10_com_nesbf16_eq(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_eq: +; CHECK: # %bb.0: +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setnp %al # encoding: [0x0f,0x9b,0xc0] +; CHECK-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1] +; CHECK-NEXT: andb %al, %cl # encoding: [0x20,0xc1] +; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16eq(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_lt(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_lt: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8] +; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16lt(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_le(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_le: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm0, %xmm1 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc8] +; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16le(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define 
i32 @test_x86_avx10_com_nesbf16_gt(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_gt: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16ge(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +define i32 @test_x86_avx10_com_nesbf16_neq(<8 x bfloat> %a0, <8 x bfloat> %a1) { +; CHECK-LABEL: test_x86_avx10_com_nesbf16_neq: +; CHECK: # %bb.0: +; CHECK-NEXT: vcomsbf16 %xmm1, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xc1] +; CHECK-NEXT: setp %al # encoding: [0x0f,0x9a,0xc0] +; CHECK-NEXT: setne %cl # encoding: [0x0f,0x95,0xc1] +; CHECK-NEXT: orb %al, %cl # encoding: [0x08,0xc1] +; CHECK-NEXT: movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call i32 @llvm.x86.avx10.vcomsbf16neq(<8 x bfloat> %a0, <8 x bfloat> %a1) + ret i32 %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat> @test_rsqrt_nepbf16_128(<8 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <16 x bfloat> @test_rsqrt_nepbf16_256(<16 x bfloat> %a0) { +; CHECK-LABEL: test_rsqrt_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtpbf16 %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat> @test_rcp_nepbf16_128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask) { +; X64-LABEL: test_rcp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x4c,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.128(<8 x bfloat> %a0, <8 x bfloat> %a1, i8 %mask) + ret <8 x bfloat> %res +} + +define <16 x bfloat> @test_rcp_nepbf16_256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask) { +; X64-LABEL: test_rcp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_rcp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrcppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x4c,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rcp.nepbf16.256(<16 x bfloat> %a0, <16 x bfloat> %a1, i16 %mask) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_reduce_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08] +; X64-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_reduce_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vreducenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x56,0xc8,0x08] +; X86-NEXT: vreducenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x56,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.reduce.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x i1> 
@llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat>, i32) +declare <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat>, i32) + +define i8 @test_int_x86_avx512_fpclass_nepbf16_128(<8 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %xmm0, %k1 # encoding: [0x62,0xf3,0x7f,0x08,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 4) + %res1 = call <8 x i1> @llvm.x86.avx10.fpclass.nepbf16.128(<8 x bfloat> %x0, i32 2) + %1 = and <8 x i1> %res1, %res + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define i16 @test_int_x86_avx512_fpclass_nepbf16_256(<16 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasspbf16 $2, %ymm0, %k1 # encoding: [0x62,0xf3,0x7f,0x28,0x66,0xc8,0x02] +; CHECK-NEXT: vfpclasspbf16 $4, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x66,0xc0,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 4) + %res1 = call <16 x i1> @llvm.x86.avx10.fpclass.nepbf16.256(<16 x bfloat> %x0, i32 2) + %1 = and <16 x i1> %res1, %res + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_getexp_nepbf16_128(<8 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexppbf16 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x42,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x42,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, i8 %x2) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_128(<8 x bfloat> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x42,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> zeroinitializer, i8 %x2) + ret <8 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_getexp_nepbf16_256(<16 x bfloat> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexppbf16 %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x42,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_mask_getexp_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2) { +; X64-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getexp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x42,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x2) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_maskz_getexp_nepbf16_256(<16 x bfloat> %x0, i16 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_getexp_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetexppbf16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x42,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getexp.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> zeroinitializer, i16 %x2) + ret <16 x bfloat> %res +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, 
%xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_getmant_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08] +; X64-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_getmant_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vgetmantpbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x26,0xc8,0x08] +; X86-NEXT: vgetmantpbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x26,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.getmant.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat>, i32, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat>, i32, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x09,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7f,0x08,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <8 x 
bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 8, <8 x bfloat> %x2, i8 %x3) + %res1 = call <8 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.128(<8 x bfloat> %x0, i32 4, <8 x bfloat> %x2, i8 -1) + %res2 = fadd <8 x bfloat> %res, %res1 + ret <8 x bfloat> %res2 +} + +define <16 x bfloat>@test_int_x86_avx512_mask_rndscale_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08] +; X64-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04] +; X64-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_rndscale_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vrndscalenepbf16 $8, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7f,0x29,0x08,0xc8,0x08] +; X86-NEXT: vrndscalenepbf16 $4, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7f,0x28,0x08,0xc0,0x04] +; X86-NEXT: vaddnepbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 8, <16 x bfloat> %x2, i16 %x3) + %res1 = call <16 x bfloat> @llvm.x86.avx10.mask.rndscale.nepbf16.256(<16 x bfloat> %x0, i32 4, <16 x bfloat> %x2, i16 -1) + %res2 = fadd <16 x bfloat> %res, %res1 + ret <16 x bfloat> %res2 +} + +declare <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8) +declare <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>, i16) + +define <8 x bfloat>@test_int_x86_avx512_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x2c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 -1) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x2c,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> %x2, i8 %x3) + ret <8 x bfloat> %res +} + +define <8 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_128(<8 x bfloat> %x0, <8 x 
bfloat> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x2c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.128(<8 x bfloat> %x0, <8 x bfloat> %x1, <8 x bfloat> zeroinitializer, i8 %x3) + ret <8 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_nepbf16_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x2c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 -1) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_mask_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_mask_scalef_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x2c,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> %x2, i16 %x3) + ret <16 x bfloat> %res +} + +define <16 x bfloat>@test_int_x86_avx512_maskz_scalef_nepbf16_256(<16 x bfloat> %x0, <16 x bfloat> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx512_maskz_scalef_nepbf16_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vscalefpbf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x2c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x bfloat> @llvm.x86.avx10.mask.scalef.nepbf16.256(<16 x bfloat> %x0, <16 x bfloat> %x1, <16 x bfloat> zeroinitializer, i16 %x3) + ret <16 x bfloat> %res +} diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt new file mode 100644 index 0000000000000..8cc53db077e4f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt @@ -0,0 +1,3015 
@@ +# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x58,0xd4 + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vaddnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x58,0xd4 + +# ATT: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vaddnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x58,0xd4 + +# ATT: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vaddnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x58,0xd4 + +# ATT: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x58,0xd4 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x58,0x10 + +# ATT: vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x58,0x52,0x80 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x58,0x10 + +# ATT: vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x58,0x52,0x80 + +# ATT: vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 
{%k7} +# INTEL: vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x58,0x10 + +# ATT: vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f + +# ATT: vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x58,0x52,0x80 + +# ATT: vcmppbf16 $123, %ymm4, %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymm4, 123 +0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymm4, 123 +0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %xmm4, %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmm4, 123 +0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmm4, 123 +0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %zmm4, %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmm4, 123 +0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmm4, 123 +0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 +# INTEL: vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 +# INTEL: vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 
+0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b + +# ATT: vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 +# INTEL: vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b + +# ATT: vcomsbf16 %xmm3, %xmm2 +# INTEL: vcomsbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x2f,0xd3 + +# ATT: vcomsbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcomsbf16 291(%edi,%eax,4), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcomsbf16 (%eax), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [eax] +0x62,0xf5,0x7d,0x08,0x2f,0x10 + +# ATT: vcomsbf16 -64(,%ebp,2), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [2*ebp - 64] +0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomsbf16 254(%ecx), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [ecx + 254] +0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f + +# ATT: vcomsbf16 -256(%edx), %xmm2 +# INTEL: vcomsbf16 xmm2, word ptr [edx - 256] +0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5e,0xd4 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vdivnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5e,0xd4 + +# ATT: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vdivnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5e,0xd4 + +# ATT: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vdivnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5e,0xd4 + +# ATT: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5e,0xd4 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, word ptr 
[eax]{1to32} +0x62,0xf5,0x65,0x58,0x5e,0x10 + +# ATT: vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5e,0x10 + +# ATT: vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80 + +# ATT: vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5e,0x10 + +# ATT: vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f + +# ATT: vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd132nepbf16 
xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x98,0xd4 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x98,0x10 + +# ATT: vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x98,0x52,0x80 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x98,0x10 + +# ATT: vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x98,0x52,0x80 + +# ATT: vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x98,0x10 + +# ATT: vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f + +# ATT: vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x98,0x52,0x80 + +# ATT: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %ymm4, 
%ymm3, %ymm2 {%k7} +# INTEL: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xa8,0xd4 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80 + +# ATT: vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] 
+0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xa8,0x10 + +# ATT: vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f + +# ATT: vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xb8,0xd4 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xb8,0x10 + +# ATT: vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xb8,0x10 + +# ATT: vfmadd231nepbf16 
-1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80 + +# ATT: vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xb8,0x10 + +# ATT: vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f + +# ATT: vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9a,0xd4 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: 
vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80 + +# ATT: vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9a,0x10 + +# ATT: vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f + +# ATT: vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} 
{z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xaa,0xd4 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80 + +# ATT: vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xaa,0x10 + +# ATT: vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f + +# ATT: vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 
{%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xba,0xd4 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xba,0x10 + +# ATT: vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xba,0x52,0x80 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xba,0x10 + +# ATT: vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xba,0x52,0x80 + +# ATT: vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} 
+0x62,0xf6,0x64,0x18,0xba,0x10 + +# ATT: vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f + +# ATT: vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xba,0x52,0x80 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9c,0xd4 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] 
+0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9c,0x10 + +# ATT: vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f + +# ATT: vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xac,0xd4 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 
8128] +0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xac,0x52,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xac,0x52,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xac,0x10 + +# ATT: vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f + +# ATT: vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xac,0x52,0x80 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, 
xmm4 +0x62,0xf6,0x64,0x8f,0xbc,0xd4 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xbc,0x10 + +# ATT: vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f + +# ATT: vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80 + +# ATT: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x9e,0xd4 + +# ATT: 
vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x9e,0xd4 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 
+# INTEL: vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x9e,0x10 + +# ATT: vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f + +# ATT: vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xae,0xd4 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xae,0x52,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub213nepbf16 ymm2, ymm3, 
ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xae,0x52,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xae,0x10 + +# ATT: vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f + +# ATT: vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xae,0x52,0x80 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf6,0x64,0x4f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0xbe,0xd4 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, 
zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0xbe,0x10 + +# ATT: vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f + +# ATT: vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80 + +# ATT: vfpclasspbf16 $123, %zmm3, %k5 +# INTEL: vfpclasspbf16 k5, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %zmm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %ymm3, %k5 +# INTEL: vfpclasspbf16 k5, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %ymm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %xmm3, %k5 +# INTEL: vfpclasspbf16 k5, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16 $123, %xmm3, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b + +# ATT: vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vfpclasspbf16 $123, 
(%eax){1to8}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b + +# ATT: vfpclasspbf16x $123, -512(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%eax){1to16}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b + +# ATT: vfpclasspbf16y $123, -1024(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%eax){1to32}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b + +# ATT: vfpclasspbf16z $123, -2048(,%ebp,2), %k5 +# INTEL: vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b + +# ATT: vgetexppbf16 %xmm3, %xmm2 +# INTEL: vgetexppbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x42,0xd3 + +# ATT: vgetexppbf16 %xmm3, %xmm2 {%k7} +# INTEL: vgetexppbf16 xmm2 {k7}, xmm3 +0x62,0xf5,0x7d,0x0f,0x42,0xd3 + +# ATT: vgetexppbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vgetexppbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7d,0x8f,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 +# INTEL: vgetexppbf16 zmm2, zmm3 +0x62,0xf5,0x7d,0x48,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 {%k7} +# INTEL: vgetexppbf16 zmm2 {k7}, zmm3 +0x62,0xf5,0x7d,0x4f,0x42,0xd3 + +# ATT: vgetexppbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf5,0x7d,0xcf,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 +# INTEL: vgetexppbf16 ymm2, ymm3 +0x62,0xf5,0x7d,0x28,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 {%k7} +# INTEL: vgetexppbf16 ymm2 {k7}, ymm3 +0x62,0xf5,0x7d,0x2f,0x42,0xd3 + +# ATT: vgetexppbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf5,0x7d,0xaf,0x42,0xd3 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to8}, %xmm2 +# INTEL: vgetexppbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7d,0x18,0x42,0x10 + +# ATT: vgetexppbf16 -512(,%ebp,2), %xmm2 +# INTEL: vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} +# 
INTEL: vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to16}, %ymm2 +# INTEL: vgetexppbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf5,0x7d,0x38,0x42,0x10 + +# ATT: vgetexppbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80 + +# ATT: vgetexppbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%eax){1to32}, %zmm2 +# INTEL: vgetexppbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf5,0x7d,0x58,0x42,0x10 + +# ATT: vgetexppbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f + +# ATT: vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80 + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vgetmantpbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vgetmantpbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vgetmantpbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 
{%k7} +# INTEL: vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b + +# ATT: vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5f,0xd4 + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vmaxpbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5f,0xd4 + +# ATT: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vmaxpbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5f,0xd4 + +# ATT: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 
zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vmaxpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5f,0xd4 + +# ATT: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5f,0xd4 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5f,0x10 + +# ATT: vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5f,0x10 + +# ATT: vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80 + +# ATT: vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5f,0x10 + +# ATT: vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f + +# ATT: vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5d,0xd4 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vminpbf16 ymm2 {k7}, ymm3, 
ymm4 +0x62,0xf5,0x65,0x2f,0x5d,0xd4 + +# ATT: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vminpbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5d,0xd4 + +# ATT: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vminpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5d,0xd4 + +# ATT: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5d,0xd4 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5d,0x10 + +# ATT: vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5d,0x10 + +# ATT: vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80 + +# ATT: vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5d,0x10 + +# ATT: vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} 
{z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f + +# ATT: vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x59,0xd4 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vmulnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x59,0xd4 + +# ATT: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vmulnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x59,0xd4 + +# ATT: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vmulnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x59,0xd4 + +# ATT: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x59,0xd4 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x59,0x10 + +# ATT: vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x59,0x52,0x80 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x59,0x10 + +# ATT: vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x59,0x52,0x80 + +# ATT: vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] 
+0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x59,0x10 + +# ATT: vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f + +# ATT: vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x59,0x52,0x80 + +# ATT: vrcppbf16 %xmm3, %xmm2 +# INTEL: vrcppbf16 xmm2, xmm3 +0x62,0xf6,0x7c,0x08,0x4c,0xd3 + +# ATT: vrcppbf16 %xmm3, %xmm2 {%k7} +# INTEL: vrcppbf16 xmm2 {k7}, xmm3 +0x62,0xf6,0x7c,0x0f,0x4c,0xd3 + +# ATT: vrcppbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf6,0x7c,0x8f,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 +# INTEL: vrcppbf16 zmm2, zmm3 +0x62,0xf6,0x7c,0x48,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 {%k7} +# INTEL: vrcppbf16 zmm2 {k7}, zmm3 +0x62,0xf6,0x7c,0x4f,0x4c,0xd3 + +# ATT: vrcppbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf6,0x7c,0xcf,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 +# INTEL: vrcppbf16 ymm2, ymm3 +0x62,0xf6,0x7c,0x28,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 {%k7} +# INTEL: vrcppbf16 ymm2 {k7}, ymm3 +0x62,0xf6,0x7c,0x2f,0x4c,0xd3 + +# ATT: vrcppbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf6,0x7c,0xaf,0x4c,0xd3 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to8}, %xmm2 +# INTEL: vrcppbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf6,0x7c,0x18,0x4c,0x10 + +# ATT: vrcppbf16 -512(,%ebp,2), %xmm2 +# INTEL: vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to16}, %ymm2 +# INTEL: vrcppbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf6,0x7c,0x38,0x4c,0x10 + +# ATT: vrcppbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrcppbf16 ymm2 
{k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80 + +# ATT: vrcppbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%eax){1to32}, %zmm2 +# INTEL: vrcppbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf6,0x7c,0x58,0x4c,0x10 + +# ATT: vrcppbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f + +# ATT: vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80 + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 +# INTEL: vreducenepbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vreducenepbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 +# INTEL: vreducenepbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vreducenepbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 +# INTEL: vreducenepbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vreducenepbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 
+0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b + +# ATT: vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmm3, 123 +0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} +# INTEL: vrndscalenepbf16 zmm2 {k7}, zmm3, 123 +0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 +0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymm3, 123 +0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} +# INTEL: vrndscalenepbf16 ymm2 {k7}, ymm3, 123 +0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 +0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmm3, 123 +0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} +# INTEL: vrndscalenepbf16 xmm2 {k7}, xmm3, 123 +0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 +0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 +# INTEL: vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 
+0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 +# INTEL: vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 +# INTEL: vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 +# INTEL: vrndscalenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 +# INTEL: vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 +# INTEL: vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmm3 +0x62,0xf6,0x7c,0x08,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 {%k7} +# INTEL: vrsqrtpbf16 xmm2 {k7}, xmm3 +0x62,0xf6,0x7c,0x0f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf6,0x7c,0x8f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmm3 +0x62,0xf6,0x7c,0x48,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 {%k7} +# INTEL: vrsqrtpbf16 zmm2 {k7}, zmm3 +0x62,0xf6,0x7c,0x4f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf6,0x7c,0xcf,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymm3 +0x62,0xf6,0x7c,0x28,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 {%k7} +# INTEL: 
vrsqrtpbf16 ymm2 {k7}, ymm3 +0x62,0xf6,0x7c,0x2f,0x4e,0xd3 + +# ATT: vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf6,0x7c,0xaf,0x4e,0xd3 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to8}, %xmm2 +# INTEL: vrsqrtpbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf6,0x7c,0x18,0x4e,0x10 + +# ATT: vrsqrtpbf16 -512(,%ebp,2), %xmm2 +# INTEL: vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to16}, %ymm2 +# INTEL: vrsqrtpbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf6,0x7c,0x38,0x4e,0x10 + +# ATT: vrsqrtpbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80 + +# ATT: vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%eax){1to32}, %zmm2 +# INTEL: vrsqrtpbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf6,0x7c,0x58,0x4e,0x10 + +# ATT: vrsqrtpbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f + +# ATT: vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymm4 +0x62,0xf6,0x64,0x28,0x2c,0xd4 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vscalefpbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf6,0x64,0x2f,0x2c,0xd4 + +# ATT: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf6,0x64,0xaf,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmm4 +0x62,0xf6,0x64,0x48,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vscalefpbf16 zmm2 {k7}, zmm3, zmm4 
+0x62,0xf6,0x64,0x4f,0x2c,0xd4 + +# ATT: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf6,0x64,0xcf,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmm4 +0x62,0xf6,0x64,0x08,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vscalefpbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf6,0x64,0x0f,0x2c,0xd4 + +# ATT: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf6,0x64,0x8f,0x2c,0xd4 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf6,0x64,0x58,0x2c,0x10 + +# ATT: vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf6,0x64,0x38,0x2c,0x10 + +# ATT: vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80 + +# ATT: vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf6,0x64,0x18,0x2c,0x10 + +# ATT: vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f + +# ATT: vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 
256]{1to8} +0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmm3 +0x62,0xf5,0x7d,0x08,0x51,0xd3 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 {%k7} +# INTEL: vsqrtnepbf16 xmm2 {k7}, xmm3 +0x62,0xf5,0x7d,0x0f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7d,0x8f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmm3 +0x62,0xf5,0x7d,0x48,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 {%k7} +# INTEL: vsqrtnepbf16 zmm2 {k7}, zmm3 +0x62,0xf5,0x7d,0x4f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, zmm3 +0x62,0xf5,0x7d,0xcf,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymm3 +0x62,0xf5,0x7d,0x28,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 {%k7} +# INTEL: vsqrtnepbf16 ymm2 {k7}, ymm3 +0x62,0xf5,0x7d,0x2f,0x51,0xd3 + +# ATT: vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, ymm3 +0x62,0xf5,0x7d,0xaf,0x51,0xd3 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to8}, %xmm2 +# INTEL: vsqrtnepbf16 xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7d,0x18,0x51,0x10 + +# ATT: vsqrtnepbf16 -512(,%ebp,2), %xmm2 +# INTEL: vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to16}, %ymm2 +# INTEL: vsqrtnepbf16 ymm2, word ptr [eax]{1to16} +0x62,0xf5,0x7d,0x38,0x51,0x10 + +# ATT: vsqrtnepbf16 -1024(,%ebp,2), %ymm2 +# INTEL: vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80 + +# ATT: vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%eax){1to32}, %zmm2 +# INTEL: vsqrtnepbf16 zmm2, word ptr [eax]{1to32} +0x62,0xf5,0x7d,0x58,0x51,0x10 + +# ATT: vsqrtnepbf16 -2048(,%ebp,2), %zmm2 +# INTEL: vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] 
+0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f + +# ATT: vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymm4 +0x62,0xf5,0x65,0x28,0x5c,0xd4 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vsubnepbf16 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x65,0x2f,0x5c,0xd4 + +# ATT: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x65,0xaf,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmm4 +0x62,0xf5,0x65,0x48,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vsubnepbf16 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x65,0x4f,0x5c,0xd4 + +# ATT: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x65,0xcf,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, xmm4 +0x62,0xf5,0x65,0x08,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vsubnepbf16 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x65,0x0f,0x5c,0xd4 + +# ATT: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x65,0x8f,0x5c,0xd4 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x65,0x58,0x5c,0x10 + +# ATT: vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x65,0x38,0x5c,0x10 + +# ATT: vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80 + +# ATT: vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: 
vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x65,0x18,0x5c,0x10 + +# ATT: vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f + +# ATT: vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80 + diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt new file mode 100644 index 0000000000000..953ef8dd8a14c --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt @@ -0,0 +1,3015 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x58,0xf0 + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vaddnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x58,0xf0 + +# ATT: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vaddnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x58,0xf0 + +# ATT: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vaddnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x58,0xf0 + +# ATT: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x58,0xf0 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 
256]{1to32} +0x62,0xe5,0x45,0xd7,0x58,0x72,0x80 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x58,0x72,0x80 + +# ATT: vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00 + +# ATT: vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x58,0x71,0x7f + +# ATT: vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x58,0x72,0x80 + +# ATT: vcmppbf16 $123, %ymm24, %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymm24, 123 +0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymm24, 123 +0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %xmm24, %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmm24, 123 +0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmm24, 123 +0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %zmm24, %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, zmm24, 123 +0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmm24, 123 +0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 +# INTEL: vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 +0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 +# INTEL: vcmppbf16 
k5, zmm23, zmmword ptr [2*rbp - 2048], 123 +0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 +0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 +0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 +0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 +# INTEL: vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 +0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 2032], 123 +0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 +0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b + +# ATT: vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 +0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 +# INTEL: vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 +0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 +0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b + +# ATT: vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} +# INTEL: vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 +0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b + +# ATT: vcomsbf16 %xmm23, %xmm22 +# INTEL: vcomsbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x2f,0xf7 + +# ATT: vcomsbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomsbf16 291(%r8,%rax,4), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcomsbf16 (%rip), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rip] +0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcomsbf16 -64(,%rbp,2), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [2*rbp - 64] +0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomsbf16 254(%rcx), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rcx + 254] +0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f + +# ATT: vcomsbf16 -256(%rdx), %xmm22 +# INTEL: vcomsbf16 xmm22, word ptr [rdx - 256] +0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80 + +# ATT: vdivnepbf16 %ymm24, 
%ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5e,0xf0 + +# ATT: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vdivnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5e,0xf0 + +# ATT: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vdivnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5e,0xf0 + +# ATT: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vdivnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5e,0xf0 + +# ATT: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5e,0xf0 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80 + +# ATT: vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f + +# ATT: vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5e,0x72,0x80 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x98,0xf0 + +# ATT: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x98,0xf0 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x98,0x72,0x80 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: 
vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x98,0x72,0x80 + +# ATT: vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x98,0x71,0x7f + +# ATT: vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x98,0x72,0x80 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xa8,0xf0 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} 
+0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80 + +# ATT: vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f + +# ATT: vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xa8,0x72,0x80 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# 
INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xb8,0xf0 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80 + +# ATT: vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: 
vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f + +# ATT: vfmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xb8,0x72,0x80 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9a,0xf0 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] 
+0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80 + +# ATT: vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f + +# ATT: vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9a,0x72,0x80 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xaa,0xf0 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, 
%zmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80 + +# ATT: vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f + +# ATT: vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xaa,0x72,0x80 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xba,0xf0 + +# ATT: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, 
xmm24 +0x62,0x86,0x44,0x87,0xba,0xf0 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xba,0x72,0x80 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xba,0x72,0x80 + +# ATT: vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xba,0x71,0x7f + +# ATT: vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xba,0x72,0x80 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} 
+# INTEL: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9c,0xf0 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80 + +# ATT: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132nepbf16 
291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f + +# ATT: vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9c,0x72,0x80 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xac,0xf0 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xac,0x72,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: 
vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xac,0x72,0x80 + +# ATT: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xac,0x71,0x7f + +# ATT: vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xac,0x72,0x80 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xbc,0xf0 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 
+ 4*rax + 291] +0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80 + +# ATT: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f + +# ATT: vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xbc,0x72,0x80 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmm24 
+0x62,0x86,0x44,0x40,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x9e,0xf0 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80 + +# ATT: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} 
+0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f + +# ATT: vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x9e,0x72,0x80 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xae,0xf0 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xae,0x72,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00 
+ +# ATT: vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xae,0x72,0x80 + +# ATT: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xae,0x71,0x7f + +# ATT: vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xae,0x72,0x80 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0xbe,0xf0 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, 
%zmm22 +# INTEL: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80 + +# ATT: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00 + +# ATT: vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f + +# ATT: vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0xbe,0x72,0x80 + +# ATT: vfpclasspbf16 $123, %zmm23, %k5 +# INTEL: vfpclasspbf16 k5, zmm23, 123 +0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %zmm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmm23, 123 +0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %ymm23, %k5 +# INTEL: vfpclasspbf16 k5, ymm23, 123 +0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %ymm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymm23, 123 +0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %xmm23, %k5 +# INTEL: vfpclasspbf16 k5, xmm23, 123 +0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b + +# ATT: vfpclasspbf16 $123, %xmm23, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmm23, 123 +0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b + +# ATT: vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 +# INTEL: 
vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to8}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 +0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16x $123, -512(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 +0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 +0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to8}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to16}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 +0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16y $123, -1024(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 +0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 +0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasspbf16 $123, (%rip){1to32}, %k5 +# INTEL: vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 +0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b + +# ATT: vfpclasspbf16z $123, -2048(,%rbp,2), %k5 +# INTEL: vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 +0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 +0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b + +# ATT: vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} +# INTEL: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b + +# ATT: vgetexppbf16 %xmm23, %xmm22 +# INTEL: vgetexppbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x42,0xf7 + +# ATT: vgetexppbf16 %xmm23, %xmm22 {%k7} +# INTEL: vgetexppbf16 xmm22 {k7}, xmm23 +0x62,0xa5,0x7d,0x0f,0x42,0xf7 + +# ATT: vgetexppbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7d,0x8f,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 +# INTEL: vgetexppbf16 zmm22, zmm23 +0x62,0xa5,0x7d,0x48,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 {%k7} +# INTEL: vgetexppbf16 zmm22 {k7}, zmm23 +0x62,0xa5,0x7d,0x4f,0x42,0xf7 + +# ATT: vgetexppbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa5,0x7d,0xcf,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 +# INTEL: vgetexppbf16 ymm22, ymm23 +0x62,0xa5,0x7d,0x28,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 {%k7} +# INTEL: vgetexppbf16 ymm22 {k7}, ymm23 +0x62,0xa5,0x7d,0x2f,0x42,0xf7 + +# ATT: vgetexppbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa5,0x7d,0xaf,0x42,0xf7 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: 
vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to8}, %xmm22 +# INTEL: vgetexppbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -512(,%rbp,2), %xmm22 +# INTEL: vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vgetexppbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to16}, %ymm22 +# INTEL: vgetexppbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80 + +# ATT: vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vgetexppbf16 (%rip){1to32}, %zmm22 +# INTEL: vgetexppbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00 + +# ATT: vgetexppbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f + +# ATT: vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80 + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vgetmantpbf16 zmm22 {k7}, zmm23, 123 +0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vgetmantpbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 +# INTEL: vgetmantpbf16 xmm22, 
xmm23, 123 +0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vgetmantpbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 +0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b + +# ATT: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 
+0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5f,0xf0 + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vmaxpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5f,0xf0 + +# ATT: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vmaxpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5f,0xf0 + +# ATT: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vmaxpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5f,0xf0 + +# ATT: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5f,0xf0 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80 + +# ATT: vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f + +# ATT: vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5f,0x72,0x80 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5d,0xf0 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vminpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5d,0xf0 + +# ATT: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vminpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5d,0xf0 + +# ATT: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vminpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5d,0xf0 + +# ATT: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5d,0xf0 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff + +# 
ATT: vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80 + +# ATT: vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vminpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00 + +# ATT: vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f + +# ATT: vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5d,0x72,0x80 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x59,0xf0 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vmulnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x59,0xf0 + +# ATT: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vmulnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x59,0xf0 + +# ATT: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vmulnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x59,0xf0 + +# ATT: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x59,0xf0 + +# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x59,0x72,0x80 + 
+# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x59,0x72,0x80 + +# ATT: vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00 + +# ATT: vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x59,0x71,0x7f + +# ATT: vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x59,0x72,0x80 + +# ATT: vrcppbf16 %xmm23, %xmm22 +# INTEL: vrcppbf16 xmm22, xmm23 +0x62,0xa6,0x7c,0x08,0x4c,0xf7 + +# ATT: vrcppbf16 %xmm23, %xmm22 {%k7} +# INTEL: vrcppbf16 xmm22 {k7}, xmm23 +0x62,0xa6,0x7c,0x0f,0x4c,0xf7 + +# ATT: vrcppbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa6,0x7c,0x8f,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 +# INTEL: vrcppbf16 zmm22, zmm23 +0x62,0xa6,0x7c,0x48,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 {%k7} +# INTEL: vrcppbf16 zmm22 {k7}, zmm23 +0x62,0xa6,0x7c,0x4f,0x4c,0xf7 + +# ATT: vrcppbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa6,0x7c,0xcf,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 +# INTEL: vrcppbf16 ymm22, ymm23 +0x62,0xa6,0x7c,0x28,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 {%k7} +# INTEL: vrcppbf16 ymm22 {k7}, ymm23 +0x62,0xa6,0x7c,0x2f,0x4c,0xf7 + +# ATT: vrcppbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa6,0x7c,0xaf,0x4c,0xf7 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to8}, %xmm22 +# INTEL: vrcppbf16 xmm22, word ptr [rip]{1to8} 
+0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -512(,%rbp,2), %xmm22 +# INTEL: vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to16}, %ymm22 +# INTEL: vrcppbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80 + +# ATT: vrcppbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrcppbf16 (%rip){1to32}, %zmm22 +# INTEL: vrcppbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrcppbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f + +# ATT: vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80 + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 +# INTEL: vreducenepbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vreducenepbf16 zmm22 {k7}, zmm23, 123 +0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 +# INTEL: vreducenepbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vreducenepbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 +# INTEL: vreducenepbf16 xmm22, xmm23, 123 +0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vreducenepbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, xmm23, 
123 +0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b + +# ATT: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vreducenepbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b + +# ATT: vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmm23, 123 +0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} +# INTEL: vrndscalenepbf16 zmm22 {k7}, zmm23, 123 
+0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 +0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymm23, 123 +0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} +# INTEL: vrndscalenepbf16 ymm22 {k7}, ymm23, 123 +0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 +0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmm23, 123 +0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} +# INTEL: vrndscalenepbf16 xmm22 {k7}, xmm23, 123 +0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 +0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 +# INTEL: vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 +0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 +# INTEL: vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 +# INTEL: vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 +0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 +# INTEL: vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b + +# ATT: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrndscalenepbf16 zmm22 {k7}, 
zmmword ptr [r8 + 4*rax + 291], 123 +0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 +# INTEL: vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 +0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b + +# ATT: vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 +# INTEL: vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b + +# ATT: vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b + +# ATT: vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmm23 +0x62,0xa6,0x7c,0x08,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 {%k7} +# INTEL: vrsqrtpbf16 xmm22 {k7}, xmm23 +0x62,0xa6,0x7c,0x0f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa6,0x7c,0x8f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmm23 +0x62,0xa6,0x7c,0x48,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 {%k7} +# INTEL: vrsqrtpbf16 zmm22 {k7}, zmm23 +0x62,0xa6,0x7c,0x4f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa6,0x7c,0xcf,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymm23 +0x62,0xa6,0x7c,0x28,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 {%k7} +# INTEL: vrsqrtpbf16 ymm22 {k7}, ymm23 +0x62,0xa6,0x7c,0x2f,0x4e,0xf7 + +# ATT: vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa6,0x7c,0xaf,0x4e,0xf7 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to8}, %xmm22 +# INTEL: vrsqrtpbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -512(,%rbp,2), %xmm22 +# INTEL: vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to16}, %ymm22 +# INTEL: vrsqrtpbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] 
+0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80 + +# ATT: vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vrsqrtpbf16 (%rip){1to32}, %zmm22 +# INTEL: vrsqrtpbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vrsqrtpbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f + +# ATT: vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymm24 +0x62,0x86,0x44,0x20,0x2c,0xf0 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vscalefpbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x86,0x44,0x27,0x2c,0xf0 + +# ATT: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x86,0x44,0xa7,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmm24 +0x62,0x86,0x44,0x40,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vscalefpbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x86,0x44,0x47,0x2c,0xf0 + +# ATT: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x86,0x44,0xc7,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmm24 +0x62,0x86,0x44,0x00,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vscalefpbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x86,0x44,0x07,0x2c,0xf0 + +# ATT: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x86,0x44,0x87,0x2c,0xf0 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80 + +# ATT: vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f + +# ATT: vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe6,0x44,0x97,0x2c,0x72,0x80 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmm23 +0x62,0xa5,0x7d,0x08,0x51,0xf7 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 {%k7} +# INTEL: vsqrtnepbf16 xmm22 {k7}, xmm23 +0x62,0xa5,0x7d,0x0f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7d,0x8f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmm23 +0x62,0xa5,0x7d,0x48,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 {%k7} +# INTEL: vsqrtnepbf16 zmm22 {k7}, zmm23 +0x62,0xa5,0x7d,0x4f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, zmm23 +0x62,0xa5,0x7d,0xcf,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymm23 +0x62,0xa5,0x7d,0x28,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 {%k7} +# INTEL: vsqrtnepbf16 ymm22 {k7}, ymm23 +0x62,0xa5,0x7d,0x2f,0x51,0xf7 + +# ATT: vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, ymm23 +0x62,0xa5,0x7d,0xaf,0x51,0xf7 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to8}, %xmm22 +# INTEL: vsqrtnepbf16 xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 
-512(,%rbp,2), %xmm22 +# INTEL: vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to16}, %ymm22 +# INTEL: vsqrtnepbf16 ymm22, word ptr [rip]{1to16} +0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 -1024(,%rbp,2), %ymm22 +# INTEL: vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +# INTEL: vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80 + +# ATT: vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsqrtnepbf16 (%rip){1to32}, %zmm22 +# INTEL: vsqrtnepbf16 zmm22, word ptr [rip]{1to32} +0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsqrtnepbf16 -2048(,%rbp,2), %zmm22 +# INTEL: vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f + +# ATT: vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +# INTEL: vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymm24 +0x62,0x85,0x45,0x20,0x5c,0xf0 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vsubnepbf16 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x45,0x27,0x5c,0xf0 + +# ATT: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x45,0xa7,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmm24 +0x62,0x85,0x45,0x40,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vsubnepbf16 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x45,0x47,0x5c,0xf0 + +# ATT: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x45,0xc7,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmm24 +0x62,0x85,0x45,0x00,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vsubnepbf16 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x45,0x07,0x5c,0xf0 + +# ATT: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x45,0x87,0x5c,0xf0 + +# 
ATT: vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vsubnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80 + +# ATT: vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80 + +# ATT: vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00 + +# ATT: vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f + +# ATT: vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x45,0x97,0x5c,0x72,0x80 + diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-att.s b/llvm/test/MC/X86/avx10.2-bf16-32-att.s new file mode 100644 index 0000000000000..9f62743177c9b --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-32-att.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0xd4] + vaddnepbf16 
%ymm4, %ymm3, %ymm2 + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0xd4] + vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0xd4] + vaddnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0xd4] + vaddnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0xd4] + vaddnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x58,0x10] + vaddnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f] + vaddnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x58,0x10] + vaddnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f] + vaddnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x58,0x10] + vaddnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f] + vaddnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x58,0x52,0x80] + vaddnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcmppbf16 $123, %ymm4, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b] + vcmppbf16 $123, %ymm4, %ymm3, %k5 + +// CHECK: vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b] + vcmppbf16 $123, %ymm4, %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %xmm4, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b] + vcmppbf16 $123, %xmm4, %xmm3, %k5 + +// CHECK: vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b] + vcmppbf16 $123, %xmm4, %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %zmm4, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b] + vcmppbf16 $123, %zmm4, %zmm3, %k5 + +// CHECK: vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b] + vcmppbf16 $123, %zmm4, %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %zmm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to32}, %zmm3, %k5 + +// CHECK: vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 $123, -2048(,%ebp,2), %zmm3, %k5 + +// CHECK: vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 8128(%ecx), %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to32}, %zmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %xmm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to8}, %xmm3, %k5 + +// CHECK: vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 $123, -512(,%ebp,2), %xmm3, %k5 + +// CHECK: vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} +// CHECK: 
encoding: [0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 2032(%ecx), %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to8}, %xmm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%esp,%esi,8), %ymm3, %k5 + +// CHECK: vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%edi,%eax,4), %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b] + vcmppbf16 $123, (%eax){1to16}, %ymm3, %k5 + +// CHECK: vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 $123, -1024(,%ebp,2), %ymm3, %k5 + +// CHECK: vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 4064(%ecx), %ymm3, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%edx){1to16}, %ymm3, %k5 {%k7} + +// CHECK: vcomsbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xd3] + vcomsbf16 %xmm3, %xmm2 + +// CHECK: vcomsbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomsbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcomsbf16 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomsbf16 291(%edi,%eax,4), %xmm2 + +// CHECK: vcomsbf16 (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x10] + vcomsbf16 (%eax), %xmm2 + +// CHECK: vcomsbf16 -64(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 -64(,%ebp,2), %xmm2 + +// CHECK: vcomsbf16 254(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f] + vcomsbf16 254(%ecx), %xmm2 + +// CHECK: vcomsbf16 -256(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80] + vcomsbf16 -256(%edx), %xmm2 + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0xd4] + vdivnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0xd4] + vdivnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x65,0x8f,0x5e,0xd4] + vdivnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5e,0x10] + vdivnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f] + vdivnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5e,0x10] + vdivnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f] + vdivnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5e,0x10] + vdivnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f] + vdivnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80] + vdivnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd132nepbf16 
%ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0xd4] + vfmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0xd4] + vfmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0xd4] + vfmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x98,0x10] + vfmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f] + vfmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x98,0x10] + vfmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f] + vfmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: 
encoding: [0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x98,0x10] + vfmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f] + vfmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x98,0x52,0x80] + vfmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0xd4] + vfmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0xd4] + vfmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0xd4] + vfmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f] + vfmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f] + vfmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xa8,0x10] + vfmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f] + vfmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80] + vfmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0xd4] + vfmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0xd4] + vfmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0xd4] + vfmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), 
%zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f] + vfmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f] + vfmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xb8,0x10] + vfmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f] + vfmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80] + vfmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0xd4] + vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0xd4] + vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0xd4] 
+ vfmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0xd4] + vfmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0xd4] + vfmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f] + vfmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f] + vfmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + 
vfmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9a,0x10] + vfmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f] + vfmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80] + vfmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0xd4] + vfmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0xd4] + vfmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0xd4] + vfmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f] + vfmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f] + vfmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xaa,0x10] + vfmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f] + vfmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80] + vfmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0xd4] + vfmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0xd4] + vfmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0xd4] + vfmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), 
%zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xba,0x10] + vfmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f] + vfmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xba,0x10] + vfmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f] + vfmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xba,0x10] + vfmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f] + vfmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xba,0x52,0x80] + vfmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0xd4] + vfnmadd132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// 
CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0xd4] + vfnmadd132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0xd4] + vfnmadd132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 
291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9c,0x10] + vfnmadd132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f] + vfnmadd132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80] + vfnmadd132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0xd4] + vfnmadd213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0xd4] + vfnmadd213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0xd4] + vfnmadd213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f] + vfnmadd213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: 
encoding: [0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f] + vfnmadd213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xac,0x10] + vfnmadd213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f] + vfnmadd213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xac,0x52,0x80] + vfnmadd213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0xd4] + vfnmadd231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0xd4] + vfnmadd231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0xd4] + vfnmadd231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %zmm3, 
%zmm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbc,0x10] + vfnmadd231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f] + vfnmadd231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80] + vfnmadd231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf6,0x64,0xaf,0x9e,0xd4] + vfnmsub132nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0xd4] + vfnmsub132nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0xd4] + vfnmsub132nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9e,0x10] + vfnmsub132nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f] + vfnmsub132nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80] + vfnmsub132nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0xd4] + vfnmsub213nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0xd4] + vfnmsub213nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0xd4] + vfnmsub213nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f] + vfnmsub213nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// 
CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f] + vfnmsub213nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xae,0x10] + vfnmsub213nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f] + vfnmsub213nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xae,0x52,0x80] + vfnmsub213nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0xd4] + vfnmsub231nepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0xd4] + vfnmsub231nepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0xd4] + vfnmsub231nepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbe,0x10] + vfnmsub231nepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f] + vfnmsub231nepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80] + vfnmsub231nepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vfpclasspbf16 $123, %zmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %zmm3, %k5 + +// CHECK: vfpclasspbf16 $123, %zmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %zmm3, 
%k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %ymm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %ymm3, %k5 + +// CHECK: vfpclasspbf16 $123, %ymm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %ymm3, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %xmm3, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %xmm3, %k5 + +// CHECK: vfpclasspbf16 $123, %xmm3, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b] + vfpclasspbf16 $123, %xmm3, %k5 {%k7} + +// CHECK: vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16x $123, 268435456(%esp,%esi,8), %k5 + +// CHECK: vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16x $123, 291(%edi,%eax,4), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to8}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to8}, %k5 + +// CHECK: vfpclasspbf16x $123, -512(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16x $123, -512(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16x $123, 2032(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to8}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to16}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to16}, %k5 + +// CHECK: vfpclasspbf16y $123, -1024(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16y $123, -1024(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16y $123, 4064(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to16}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%eax){1to32}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b] + vfpclasspbf16 $123, (%eax){1to32}, %k5 + +// CHECK: vfpclasspbf16z $123, -2048(,%ebp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16z $123, -2048(,%ebp,2), %k5 + +// CHECK: vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16z $123, 8128(%ecx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%edx){1to32}, %k5 {%k7} + +// CHECK: vgetexppbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 + +// CHECK: vgetexppbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vgetexppbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3] + vgetexppbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 + +// CHECK: vgetexppbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x7d,0x4f,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vgetexppbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3] + vgetexppbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vgetexppbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 + +// CHECK: vgetexppbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vgetexppbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3] + vgetexppbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10] + vgetexppbf16 (%eax){1to8}, %xmm2 + +// CHECK: vgetexppbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f] + vgetexppbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10] + vgetexppbf16 (%eax){1to16}, %ymm2 + +// CHECK: vgetexppbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f] + vgetexppbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vgetexppbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10] + vgetexppbf16 (%eax){1to32}, %zmm2 + +// CHECK: vgetexppbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f] + vgetexppbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80] + vgetexppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: 
vgetmantpbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 + +// CHECK: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b] + vgetmantpbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to8}, %xmm2 + +// CHECK: vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 
268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vgetmantpbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b] + vgetmantpbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0xd4] + vmaxpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0xd4] + vmaxpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0xd4] + vmaxpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5f,0x10] + vmaxpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f] + vmaxpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vmaxpbf16 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5f,0x10] + vmaxpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f] + vmaxpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5f,0x10] + vmaxpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f] + vmaxpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80] + vmaxpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0xd4] + vminpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0xd4] + vminpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0xd4] + vminpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: 
[0x62,0xf5,0x65,0x58,0x5d,0x10] + vminpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f] + vminpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5d,0x10] + vminpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f] + vminpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vminpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5d,0x10] + vminpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f] + vminpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80] + vminpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0xd4] + vmulnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0xd4] + vmulnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: 
encoding: [0x62,0xf5,0x65,0x08,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0xd4] + vmulnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x59,0x10] + vmulnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f] + vmulnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x59,0x10] + vmulnepbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f] + vmulnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x59,0x10] + vmulnepbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f] + vmulnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x59,0x52,0x80] + vmulnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 %xmm3, 
%xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 + +// CHECK: vrcppbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vrcppbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0xd3] + vrcppbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 + +// CHECK: vrcppbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vrcppbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0xd3] + vrcppbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrcppbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 + +// CHECK: vrcppbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vrcppbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0xd3] + vrcppbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4c,0x10] + vrcppbf16 (%eax){1to8}, %xmm2 + +// CHECK: vrcppbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f] + vrcppbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4c,0x10] + vrcppbf16 (%eax){1to16}, %ymm2 + +// CHECK: vrcppbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f] + vrcppbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrcppbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4c,0x10] + vrcppbf16 (%eax){1to32}, %zmm2 + +// CHECK: vrcppbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: 
[0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f] + vrcppbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80] + vrcppbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b] + vreducenepbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b] + vreducenepbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b] + vreducenepbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to8}, %xmm2 + +// CHECK: vreducenepbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: 
[0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vreducenepbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b] + vreducenepbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b] + vreducenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b] + vrndscalenepbf16 $123, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to8}, %xmm2 + +// 
CHECK: vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -512(,%ebp,2), %xmm2 + +// CHECK: vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to16}, %ymm2 + +// CHECK: vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -1024(,%ebp,2), %ymm2 + +// CHECK: vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b] + vrndscalenepbf16 $123, (%eax){1to32}, %zmm2 + +// CHECK: vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -2048(,%ebp,2), %zmm2 + +// CHECK: vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 $123, 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 $123, -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0xd3] + vrsqrtpbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xd3] + vrsqrtpbf16 %zmm3, %zmm2 + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0xd3] + vrsqrtpbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0xd3] + 
vrsqrtpbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0xd3] + vrsqrtpbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4e,0x10] + vrsqrtpbf16 (%eax){1to8}, %xmm2 + +// CHECK: vrsqrtpbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f] + vrsqrtpbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4e,0x10] + vrsqrtpbf16 (%eax){1to16}, %ymm2 + +// CHECK: vrsqrtpbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f] + vrsqrtpbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vrsqrtpbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4e,0x10] + vrsqrtpbf16 (%eax){1to32}, %zmm2 + +// CHECK: vrsqrtpbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f] + vrsqrtpbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80] + vrsqrtpbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x2f,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0xd4] + vscalefpbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0xd4] + vscalefpbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0xd4] + vscalefpbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x2c,0x10] + vscalefpbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f] + vscalefpbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x2c,0x10] + vscalefpbf16 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f] + vscalefpbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x2c,0x10] + vscalefpbf16 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f] + vscalefpbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80] + vscalefpbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 {%k7} + +// CHECK: vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0xd3] + vsqrtnepbf16 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 {%k7} + +// CHECK: vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0xd3] + vsqrtnepbf16 %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 {%k7} + +// CHECK: vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0xd3] + vsqrtnepbf16 %ymm3, %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x51,0x10] + vsqrtnepbf16 (%eax){1to8}, %xmm2 + +// CHECK: vsqrtnepbf16 -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 -512(,%ebp,2), %xmm2 + +// CHECK: vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f] + vsqrtnepbf16 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to16}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x51,0x10] + vsqrtnepbf16 (%eax){1to16}, %ymm2 + +// CHECK: vsqrtnepbf16 -1024(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + 
vsqrtnepbf16 -1024(,%ebp,2), %ymm2 + +// CHECK: vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f] + vsqrtnepbf16 4064(%ecx), %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to16}, %ymm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vsqrtnepbf16 (%eax){1to32}, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x51,0x10] + vsqrtnepbf16 (%eax){1to32}, %zmm2 + +// CHECK: vsqrtnepbf16 -2048(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 -2048(,%ebp,2), %zmm2 + +// CHECK: vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f] + vsqrtnepbf16 8128(%ecx), %zmm2 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80] + vsqrtnepbf16 -256(%edx){1to32}, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0xd4] + vsubnepbf16 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0xd4] + vsubnepbf16 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0xd4] + vsubnepbf16 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5c,0x10] + vsubnepbf16 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f] + vsubnepbf16 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 
{%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vsubnepbf16 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00]
+ vsubnepbf16 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+
+// CHECK: vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5c,0x10]
+ vsubnepbf16 (%eax){1to16}, %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vsubnepbf16 -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f]
+ vsubnepbf16 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vsubnepbf16 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00]
+ vsubnepbf16 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+
+// CHECK: vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5c,0x10]
+ vsubnepbf16 (%eax){1to8}, %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vsubnepbf16 -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f]
+ vsubnepbf16 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+
+// CHECK: vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z}
+// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80]
+ vsubnepbf16 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z}
+
diff --git a/llvm/test/MC/X86/avx10.2-bf16-32-intel.s b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
new file mode 100644
index 0000000000000..30c2cf45297bc
--- /dev/null
+++ b/llvm/test/MC/X86/avx10.2-bf16-32-intel.s
@@ -0,0 +1,3014 @@
+// RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vaddnepbf16 ymm2, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0xd4]
+ vaddnepbf16 ymm2, ymm3, ymm4
+
+// CHECK: vaddnepbf16 ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0xd4]
+ vaddnepbf16 ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0xd4]
+ vaddnepbf16 ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vaddnepbf16 zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0xd4]
+ vaddnepbf16 zmm2, zmm3, zmm4
+
+// CHECK: vaddnepbf16 zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0xd4]
+ vaddnepbf16 zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0xd4]
+ vaddnepbf16 zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vaddnepbf16 xmm2, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0xd4]
+ vaddnepbf16 xmm2, xmm3, xmm4
+
+// CHECK: vaddnepbf16 xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0xd4] + vaddnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0xd4] + vaddnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x58,0x10] + vaddnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x58,0x14,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x58,0x51,0x7f] + vaddnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x58,0x52,0x80] + vaddnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x58,0x10] + vaddnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x58,0x14,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x58,0x51,0x7f] + vaddnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x58,0x52,0x80] + vaddnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x94,0xf4,0x00,0x00,0x00,0x10] + vaddnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x58,0x94,0x87,0x23,0x01,0x00,0x00] + vaddnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x58,0x10] + vaddnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x58,0x14,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x58,0x51,0x7f] + vaddnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// 
CHECK: vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x58,0x52,0x80] + vaddnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcmppbf16 k5, ymm3, ymm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0xec,0x7b] + vcmppbf16 k5, ymm3, ymm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymm4, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xec,0x7b] + vcmppbf16 k5, xmm3, xmm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmm4, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xec,0x7b] + vcmppbf16 k5, zmm3, zmm4, 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xec,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmm4, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, zmm3, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x58,0xc2,0x28,0x7b] + vcmppbf16 k5, zmm3, word ptr [eax]{1to32}, 123 + +// CHECK: vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x48,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 k5, zmm3, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x4f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, zmm3, zmmword ptr [ecx + 8128], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x5f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, zmm3, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, xmm3, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x18,0xc2,0x28,0x7b] + vcmppbf16 k5, xmm3, word ptr [eax]{1to8}, 123 + +// CHECK: vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x08,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 k5, xmm3, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x0f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, xmm3, xmmword ptr [ecx + 2032], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x1f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, xmm3, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: 
[0x62,0xf3,0x67,0x28,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, ymm3, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x38,0xc2,0x28,0x7b] + vcmppbf16 k5, ymm3, word ptr [eax]{1to16}, 123 + +// CHECK: vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x28,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 k5, ymm3, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x2f,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, ymm3, ymmword ptr [ecx + 4064], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x67,0x3f,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, ymm3, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vcomsbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0xd3] + vcomsbf16 xmm2, xmm3 + +// CHECK: vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomsbf16 xmm2, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomsbf16 xmm2, word ptr [edi + 4*eax + 291] + +// CHECK: vcomsbf16 xmm2, word ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x10] + vcomsbf16 xmm2, word ptr [eax] + +// CHECK: vcomsbf16 xmm2, word ptr [2*ebp - 64] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 xmm2, word ptr [2*ebp - 64] + +// CHECK: vcomsbf16 xmm2, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x51,0x7f] + vcomsbf16 xmm2, word ptr [ecx + 254] + +// CHECK: vcomsbf16 xmm2, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x2f,0x52,0x80] + vcomsbf16 xmm2, word ptr [edx - 256] + +// CHECK: vdivnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0xd4] + vdivnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vdivnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0xd4] + vdivnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0xd4] + vdivnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vdivnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0xd4] + vdivnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vdivnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0xd4] + vdivnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0xd4] + vdivnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vdivnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0xd4] + vdivnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vdivnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0xd4] + vdivnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0xd4] + vdivnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + 
vdivnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5e,0x10] + vdivnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5e,0x51,0x7f] + vdivnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5e,0x52,0x80] + vdivnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5e,0x10] + vdivnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5e,0x51,0x7f] + vdivnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5e,0x52,0x80] + vdivnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x94,0xf4,0x00,0x00,0x00,0x10] + vdivnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5e,0x94,0x87,0x23,0x01,0x00,0x00] + vdivnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5e,0x10] + vdivnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5e,0x51,0x7f] + vdivnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5e,0x52,0x80] + vdivnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0xd4] + vfmadd132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: 
encoding: [0x62,0xf6,0x64,0x2f,0x98,0xd4] + vfmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0xd4] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0xd4] + vfmadd132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0xd4] + vfmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0xd4] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0xd4] + vfmadd132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0xd4] + vfmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0xd4] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x98,0x10] + vfmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x98,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x98,0x51,0x7f] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x98,0x52,0x80] + vfmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x98,0x10] + vfmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x98,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x98,0x51,0x7f] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x98,0x52,0x80] + vfmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr 
[edx - 256]{1to16} + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x98,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x98,0x10] + vfmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x98,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x98,0x51,0x7f] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x98,0x52,0x80] + vfmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0xd4] + vfmadd213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0xd4] + vfmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0xd4] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0xd4] + vfmadd213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0xd4] + vfmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0xd4] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0xd4] + vfmadd213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0xd4] + vfmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0xd4] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xa8,0x10] + vfmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xa8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xa8,0x51,0x7f] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, 
zmmword ptr [ecx + 8128] + +// CHECK: vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xa8,0x52,0x80] + vfmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xa8,0x10] + vfmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xa8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xa8,0x51,0x7f] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xa8,0x52,0x80] + vfmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xa8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xa8,0x10] + vfmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xa8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xa8,0x51,0x7f] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xa8,0x52,0x80] + vfmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0xd4] + vfmadd231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0xd4] + vfmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0xd4] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0xd4] + vfmadd231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0xd4] + vfmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0xd4] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, 
zmm4 + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0xd4] + vfmadd231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0xd4] + vfmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0xd4] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xb8,0x10] + vfmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xb8,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xb8,0x51,0x7f] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xb8,0x52,0x80] + vfmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xb8,0x10] + vfmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xb8,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xb8,0x51,0x7f] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xb8,0x52,0x80] + vfmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xb8,0x94,0x87,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xb8,0x10] + vfmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmadd231nepbf16 
xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xb8,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xb8,0x51,0x7f] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xb8,0x52,0x80] + vfmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0xd4] + vfmsub132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0xd4] + vfmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0xd4] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0xd4] + vfmsub132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0xd4] + vfmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0xd4] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0xd4] + vfmsub132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0xd4] + vfmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0xd4] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9a,0x10] + vfmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9a,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9a,0x51,0x7f] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9a,0x52,0x80] + vfmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 
4*eax + 291] + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9a,0x10] + vfmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9a,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9a,0x51,0x7f] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9a,0x52,0x80] + vfmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9a,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9a,0x10] + vfmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9a,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9a,0x51,0x7f] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9a,0x52,0x80] + vfmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0xd4] + vfmsub213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0xd4] + vfmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0xd4] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0xd4] + vfmsub213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0xd4] + vfmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0xd4] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0xd4] + vfmsub213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0xd4] + vfmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0xd4] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + 
+// CHECK: vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xaa,0x10] + vfmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xaa,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xaa,0x51,0x7f] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xaa,0x52,0x80] + vfmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xaa,0x10] + vfmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xaa,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xaa,0x51,0x7f] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xaa,0x52,0x80] + vfmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xaa,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xaa,0x10] + vfmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xaa,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xaa,0x51,0x7f] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xaa,0x52,0x80] + vfmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x28,0xba,0xd4] + vfmsub231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0xd4] + vfmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0xd4] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0xd4] + vfmsub231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0xd4] + vfmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0xd4] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0xd4] + vfmsub231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0xd4] + vfmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0xd4] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xba,0x10] + vfmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xba,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xba,0x51,0x7f] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xba,0x52,0x80] + vfmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xba,0x10] + vfmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xba,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xba,0x51,0x7f] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word 
ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xba,0x52,0x80] + vfmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x94,0xf4,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xba,0x94,0x87,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xba,0x10] + vfmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xba,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xba,0x51,0x7f] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xba,0x52,0x80] + vfmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0xd4] + vfnmadd132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0xd4] + vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0xd4] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0xd4] + vfnmadd132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0xd4] + vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0xd4] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0xd4] + vfnmadd132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0xd4] + vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0xd4] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9c,0x10] + vfnmadd132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: 
vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9c,0x52,0x80] + vfnmadd132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9c,0x10] + vfnmadd132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9c,0x51,0x7f] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9c,0x52,0x80] + vfnmadd132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9c,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9c,0x10] + vfnmadd132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9c,0x51,0x7f] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9c,0x52,0x80] + vfnmadd132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0xd4] + vfnmadd213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0xd4] + vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0xd4] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0xd4] + vfnmadd213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: 
[0x62,0xf6,0x64,0x4f,0xac,0xd4] + vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0xd4] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0xd4] + vfnmadd213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0xd4] + vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0xd4] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xac,0x10] + vfnmadd213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xac,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xac,0x51,0x7f] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xac,0x52,0x80] + vfnmadd213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xac,0x10] + vfnmadd213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xac,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xac,0x51,0x7f] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xac,0x52,0x80] + vfnmadd213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xac,0x94,0x87,0x23,0x01,0x00,0x00] + 
vfnmadd213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xac,0x10] + vfnmadd213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xac,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xac,0x51,0x7f] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xac,0x52,0x80] + vfnmadd213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0xd4] + vfnmadd231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0xd4] + vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0xd4] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0xd4] + vfnmadd231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0xd4] + vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0xd4] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0xd4] + vfnmadd231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0xd4] + vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0xd4] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbc,0x10] + vfnmadd231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbc,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbc,0x52,0x80] + vfnmadd231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf6,0x64,0x28,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbc,0x10] + vfnmadd231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbc,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbc,0x51,0x7f] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbc,0x52,0x80] + vfnmadd231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbc,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbc,0x10] + vfnmadd231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbc,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbc,0x51,0x7f] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbc,0x52,0x80] + vfnmadd231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0xd4] + vfnmsub132nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0xd4] + vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0xd4] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0xd4] + vfnmsub132nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0xd4] + vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0xd4] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0xd4] + vfnmsub132nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0xd4] + vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmm4 + +// 
CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x9e,0xd4] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x9e,0x10] + vfnmsub132nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x9e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x9e,0x52,0x80] + vfnmsub132nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x9e,0x10] + vfnmsub132nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x9e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x9e,0x51,0x7f] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x9e,0x52,0x80] + vfnmsub132nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x9e,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x9e,0x10] + vfnmsub132nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x9e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: 
encoding: [0x62,0xf6,0x64,0x8f,0x9e,0x51,0x7f] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x9e,0x52,0x80] + vfnmsub132nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0xd4] + vfnmsub213nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0xd4] + vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0xd4] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0xd4] + vfnmsub213nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0xd4] + vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0xd4] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0xd4] + vfnmsub213nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0xd4] + vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0xd4] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xae,0x10] + vfnmsub213nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xae,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xae,0x51,0x7f] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xae,0x52,0x80] + vfnmsub213nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xae,0x10] + vfnmsub213nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub213nepbf16 ymm2, 
ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xae,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xae,0x51,0x7f] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xae,0x52,0x80] + vfnmsub213nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xae,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xae,0x10] + vfnmsub213nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xae,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xae,0x51,0x7f] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xae,0x52,0x80] + vfnmsub213nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0xd4] + vfnmsub231nepbf16 ymm2, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0xd4] + vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0xd4] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0xd4] + vfnmsub231nepbf16 zmm2, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0xd4] + vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0xd4] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0xd4] + vfnmsub231nepbf16 xmm2, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0xd4] + vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0xd4] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 zmm2 
{k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0xbe,0x10] + vfnmsub231nepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0xbe,0x14,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0xbe,0x52,0x80] + vfnmsub231nepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0xbe,0x10] + vfnmsub231nepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0xbe,0x14,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0xbe,0x51,0x7f] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0xbe,0x52,0x80] + vfnmsub231nepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x94,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0xbe,0x94,0x87,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0xbe,0x10] + vfnmsub231nepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0xbe,0x14,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0xbe,0x51,0x7f] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0xbe,0x52,0x80] + vfnmsub231nepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vfpclasspbf16 k5, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0xeb,0x7b] + vfpclasspbf16 k5, zmm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0xeb,0x7b] + 
vfpclasspbf16 k5 {k7}, zmm3, 123 + +// CHECK: vfpclasspbf16 k5, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0xeb,0x7b] + vfpclasspbf16 k5, ymm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0xeb,0x7b] + vfpclasspbf16 k5 {k7}, ymm3, 123 + +// CHECK: vfpclasspbf16 k5, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xeb,0x7b] + vfpclasspbf16 k5, xmm3, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xeb,0x7b] + vfpclasspbf16 k5 {k7}, xmm3, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16 k5, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0xac,0x87,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16 k5, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16 k5, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x28,0x7b] + vfpclasspbf16 k5, word ptr [eax]{1to32}, 123 + +// CHECK: vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16 k5, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vgetexppbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0xd3] + vgetexppbf16 xmm2, xmm3 + +// CHECK: vgetexppbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0xd3] + vgetexppbf16 xmm2 {k7}, xmm3 + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0xd3] + 
vgetexppbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vgetexppbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0xd3] + vgetexppbf16 zmm2, zmm3 + +// CHECK: vgetexppbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0xd3] + vgetexppbf16 zmm2 {k7}, zmm3 + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0xd3] + vgetexppbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vgetexppbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0xd3] + vgetexppbf16 ymm2, ymm3 + +// CHECK: vgetexppbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0xd3] + vgetexppbf16 ymm2 {k7}, ymm3 + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0xd3] + vgetexppbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x42,0x10] + vgetexppbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x42,0x14,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x42,0x51,0x7f] + vgetexppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x42,0x52,0x80] + vgetexppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x42,0x10] + vgetexppbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x42,0x14,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x42,0x51,0x7f] + vgetexppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x42,0x52,0x80] + vgetexppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x94,0xf4,0x00,0x00,0x00,0x10] + vgetexppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x42,0x94,0x87,0x23,0x01,0x00,0x00] + vgetexppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vgetexppbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x42,0x10] + vgetexppbf16 zmm2, word ptr 
[eax]{1to32} + +// CHECK: vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x42,0x14,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x42,0x51,0x7f] + vgetexppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x42,0x52,0x80] + vgetexppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vgetmantpbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2, zmm3, 123 + +// CHECK: vgetmantpbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0xd3,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vgetmantpbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2, ymm3, 123 + +// CHECK: vgetmantpbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0xd3,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vgetmantpbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2 {k7}, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0xd3,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x26,0x10,0x7b] + vgetmantpbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x26,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x26,0x52,0x80,0x7b] + vgetmantpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 
123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x26,0x10,0x7b] + vgetmantpbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x26,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x26,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x26,0x10,0x7b] + vgetmantpbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x26,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x26,0x51,0x7f,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x26,0x52,0x80,0x7b] + vgetmantpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vmaxpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0xd4] + vmaxpbf16 ymm2, ymm3, ymm4 + +// CHECK: vmaxpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0xd4] + vmaxpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0xd4] + vmaxpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vmaxpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0xd4] + vmaxpbf16 zmm2, zmm3, zmm4 + +// CHECK: vmaxpbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0xd4] + vmaxpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0xd4] + vmaxpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vmaxpbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0xd4] + vmaxpbf16 xmm2, xmm3, xmm4 + +// CHECK: vmaxpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0xd4] + vmaxpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0xd4] + vmaxpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 
zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5f,0x10] + vmaxpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5f,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5f,0x51,0x7f] + vmaxpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5f,0x52,0x80] + vmaxpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5f,0x10] + vmaxpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5f,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5f,0x51,0x7f] + vmaxpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5f,0x52,0x80] + vmaxpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x94,0xf4,0x00,0x00,0x00,0x10] + vmaxpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5f,0x94,0x87,0x23,0x01,0x00,0x00] + vmaxpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5f,0x10] + vmaxpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5f,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5f,0x51,0x7f] + vmaxpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5f,0x52,0x80] + vmaxpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vminpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0xd4] + vminpbf16 ymm2, ymm3, ymm4 + +// CHECK: vminpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0xd4] + vminpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0xd4] + vminpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vminpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0xd4] + vminpbf16 zmm2, zmm3, zmm4 + +// CHECK: vminpbf16 zmm2 {k7}, zmm3, zmm4 +// 
CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0xd4] + vminpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0xd4] + vminpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vminpbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0xd4] + vminpbf16 xmm2, xmm3, xmm4 + +// CHECK: vminpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0xd4] + vminpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0xd4] + vminpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5d,0x10] + vminpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5d,0x14,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5d,0x51,0x7f] + vminpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x5d,0x52,0x80] + vminpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5d,0x10] + vminpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5d,0x14,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5d,0x51,0x7f] + vminpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5d,0x52,0x80] + vminpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x94,0xf4,0x00,0x00,0x00,0x10] + vminpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5d,0x94,0x87,0x23,0x01,0x00,0x00] + vminpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5d,0x10] + vminpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// 
CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5d,0x14,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5d,0x51,0x7f] + vminpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5d,0x52,0x80] + vminpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0xd4] + vmulnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vmulnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0xd4] + vmulnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0xd4] + vmulnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vmulnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0xd4] + vmulnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vmulnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0xd4] + vmulnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0xd4] + vmulnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vmulnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0xd4] + vmulnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vmulnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0xd4] + vmulnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0xd4] + vmulnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x59,0x10] + vmulnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x59,0x14,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x59,0x51,0x7f] + vmulnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0xdf,0x59,0x52,0x80] + vmulnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x59,0x10] + vmulnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: 
encoding: [0x62,0xf5,0x65,0x28,0x59,0x14,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x59,0x51,0x7f] + vmulnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x59,0x52,0x80] + vmulnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x94,0xf4,0x00,0x00,0x00,0x10] + vmulnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x59,0x94,0x87,0x23,0x01,0x00,0x00] + vmulnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x59,0x10] + vmulnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x59,0x14,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x59,0x51,0x7f] + vmulnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x59,0x52,0x80] + vmulnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vrcppbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0xd3] + vrcppbf16 xmm2, xmm3 + +// CHECK: vrcppbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0xd3] + vrcppbf16 xmm2 {k7}, xmm3 + +// CHECK: vrcppbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4c,0xd3] + vrcppbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vrcppbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0xd3] + vrcppbf16 zmm2, zmm3 + +// CHECK: vrcppbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0xd3] + vrcppbf16 zmm2 {k7}, zmm3 + +// CHECK: vrcppbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0xd3] + vrcppbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vrcppbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0xd3] + vrcppbf16 ymm2, ymm3 + +// CHECK: vrcppbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0xd3] + vrcppbf16 ymm2 {k7}, ymm3 + +// CHECK: vrcppbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0xd3] + vrcppbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4c,0x10] + vrcppbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: 
[0x62,0xf6,0x7c,0x8f,0x4c,0x51,0x7f] + vrcppbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4c,0x52,0x80] + vrcppbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4c,0x10] + vrcppbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4c,0x51,0x7f] + vrcppbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vrcppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4c,0x52,0x80] + vrcppbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x94,0xf4,0x00,0x00,0x00,0x10] + vrcppbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4c,0x94,0x87,0x23,0x01,0x00,0x00] + vrcppbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vrcppbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4c,0x10] + vrcppbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4c,0x51,0x7f] + vrcppbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4c,0x52,0x80] + vrcppbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vreducenepbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0xd3,0x7b] + vreducenepbf16 zmm2, zmm3, 123 + +// CHECK: vreducenepbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0xd3,0x7b] + vreducenepbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x56,0xd3,0x7b] + vreducenepbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vreducenepbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0xd3,0x7b] + vreducenepbf16 ymm2, ymm3, 123 + +// CHECK: vreducenepbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0xd3,0x7b] + vreducenepbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0xd3,0x7b] + vreducenepbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vreducenepbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0xd3,0x7b] + vreducenepbf16 xmm2, xmm3, 123 + +// CHECK: vreducenepbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0xd3,0x7b] + vreducenepbf16 xmm2 {k7}, 
xmm3, 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0xd3,0x7b] + vreducenepbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x56,0x10,0x7b] + vreducenepbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x56,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x56,0x51,0x7f,0x7b] + vreducenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x56,0x52,0x80,0x7b] + vreducenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x56,0x10,0x7b] + vreducenepbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x56,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x56,0x52,0x80,0x7b] + vreducenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x56,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x56,0x10,0x7b] + vreducenepbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x56,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: 
[0x62,0xf3,0x7f,0xcf,0x56,0x51,0x7f,0x7b] + vreducenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x56,0x52,0x80,0x7b] + vreducenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2, zmm3, 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2 {k7}, zmm3, 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0xd3,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, zmm3, 123 + +// CHECK: vrndscalenepbf16 ymm2, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2, ymm3, 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2 {k7}, ymm3, 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0xd3,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, ymm3, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2 {k7}, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0xd3,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, xmm3, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x08,0x10,0x7b] + vrndscalenepbf16 xmm2, word ptr [eax]{1to8}, 123 + +// CHECK: vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x08,0x14,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 xmm2, xmmword ptr [2*ebp - 512], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x8f,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032], 123 + +// CHECK: vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x9f,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8}, 123 + +// CHECK: vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x08,0x10,0x7b] + vrndscalenepbf16 ymm2, word ptr [eax]{1to16}, 123 + +// CHECK: vrndscalenepbf16 ymm2, 
ymmword ptr [2*ebp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x08,0x14,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 ymm2, ymmword ptr [2*ebp - 1024], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xaf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064], 123 + +// CHECK: vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xbf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x94,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x08,0x94,0x87,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291], 123 + +// CHECK: vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x08,0x10,0x7b] + vrndscalenepbf16 zmm2, word ptr [eax]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x08,0x14,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 zmm2, zmmword ptr [2*ebp - 2048], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xcf,0x08,0x51,0x7f,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128], 123 + +// CHECK: vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0xdf,0x08,0x52,0x80,0x7b] + vrndscalenepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vrsqrtpbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0xd3] + vrsqrtpbf16 xmm2, xmm3 + +// CHECK: vrsqrtpbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0xd3] + vrsqrtpbf16 xmm2 {k7}, xmm3 + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0xd3] + vrsqrtpbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vrsqrtpbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0xd3] + vrsqrtpbf16 zmm2, zmm3 + +// CHECK: vrsqrtpbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0xd3] + vrsqrtpbf16 zmm2 {k7}, zmm3 + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0xd3] + vrsqrtpbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vrsqrtpbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0xd3] + vrsqrtpbf16 ymm2, ymm3 + +// CHECK: vrsqrtpbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0xd3] + vrsqrtpbf16 ymm2 {k7}, ymm3 + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0xd3] + vrsqrtpbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x0f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x18,0x4e,0x10] + vrsqrtpbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] 
+// CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x4e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x7c,0x8f,0x4e,0x51,0x7f] + vrsqrtpbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x7c,0x9f,0x4e,0x52,0x80] + vrsqrtpbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x2f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0x38,0x4e,0x10] + vrsqrtpbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x4e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x7c,0xaf,0x4e,0x51,0x7f] + vrsqrtpbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7c,0xbf,0x4e,0x52,0x80] + vrsqrtpbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x94,0xf4,0x00,0x00,0x00,0x10] + vrsqrtpbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x4e,0x94,0x87,0x23,0x01,0x00,0x00] + vrsqrtpbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vrsqrtpbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0x58,0x4e,0x10] + vrsqrtpbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x4e,0x14,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x4e,0x51,0x7f] + vrsqrtpbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7c,0xdf,0x4e,0x52,0x80] + vrsqrtpbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0xd4] + vscalefpbf16 ymm2, ymm3, ymm4 + +// CHECK: vscalefpbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0xd4] + vscalefpbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0xd4] + vscalefpbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vscalefpbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0xd4] + vscalefpbf16 zmm2, zmm3, zmm4 + +// CHECK: vscalefpbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0xd4] + vscalefpbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0xd4] + vscalefpbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vscalefpbf16 xmm2, xmm3, xmm4 +// 
CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0xd4] + vscalefpbf16 xmm2, xmm3, xmm4 + +// CHECK: vscalefpbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0xd4] + vscalefpbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0xd4] + vscalefpbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x4f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0x58,0x2c,0x10] + vscalefpbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf6,0x64,0x48,0x2c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x64,0xcf,0x2c,0x51,0x7f] + vscalefpbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x64,0xdf,0x2c,0x52,0x80] + vscalefpbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x2f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0x38,0x2c,0x10] + vscalefpbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf6,0x64,0x28,0x2c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x64,0xaf,0x2c,0x51,0x7f] + vscalefpbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x64,0xbf,0x2c,0x52,0x80] + vscalefpbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vscalefpbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf6,0x64,0x0f,0x2c,0x94,0x87,0x23,0x01,0x00,0x00] + vscalefpbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x18,0x2c,0x10] + vscalefpbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf6,0x64,0x08,0x2c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vscalefpbf16 xmm2 
{k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf6,0x64,0x8f,0x2c,0x51,0x7f] + vscalefpbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf6,0x64,0x9f,0x2c,0x52,0x80] + vscalefpbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vsqrtnepbf16 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0xd3] + vsqrtnepbf16 xmm2, xmm3 + +// CHECK: vsqrtnepbf16 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0xd3] + vsqrtnepbf16 xmm2 {k7}, xmm3 + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0xd3] + vsqrtnepbf16 xmm2 {k7} {z}, xmm3 + +// CHECK: vsqrtnepbf16 zmm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0xd3] + vsqrtnepbf16 zmm2, zmm3 + +// CHECK: vsqrtnepbf16 zmm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0xd3] + vsqrtnepbf16 zmm2 {k7}, zmm3 + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0xd3] + vsqrtnepbf16 zmm2 {k7} {z}, zmm3 + +// CHECK: vsqrtnepbf16 ymm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0xd3] + vsqrtnepbf16 ymm2, ymm3 + +// CHECK: vsqrtnepbf16 ymm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0xd3] + vsqrtnepbf16 ymm2 {k7}, ymm3 + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0xd3] + vsqrtnepbf16 ymm2 {k7} {z}, ymm3 + +// CHECK: vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x51,0x10] + vsqrtnepbf16 xmm2, word ptr [eax]{1to8} + +// CHECK: vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x8f,0x51,0x51,0x7f] + vsqrtnepbf16 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x51,0x52,0x80] + vsqrtnepbf16 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 ymm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 ymm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 ymm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x51,0x10] + vsqrtnepbf16 ymm2, word ptr [eax]{1to16} + +// CHECK: vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 ymm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7d,0xaf,0x51,0x51,0x7f] + vsqrtnepbf16 ymm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vsqrtnepbf16 ymm2 {k7} {z}, 
word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x51,0x52,0x80] + vsqrtnepbf16 ymm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vsqrtnepbf16 zmm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vsqrtnepbf16 zmm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vsqrtnepbf16 zmm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x51,0x10] + vsqrtnepbf16 zmm2, word ptr [eax]{1to32} + +// CHECK: vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 zmm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0xcf,0x51,0x51,0x7f] + vsqrtnepbf16 zmm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x51,0x52,0x80] + vsqrtnepbf16 zmm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0xd4] + vsubnepbf16 ymm2, ymm3, ymm4 + +// CHECK: vsubnepbf16 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0xd4] + vsubnepbf16 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0xd4] + vsubnepbf16 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vsubnepbf16 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0xd4] + vsubnepbf16 zmm2, zmm3, zmm4 + +// CHECK: vsubnepbf16 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0xd4] + vsubnepbf16 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0xd4] + vsubnepbf16 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vsubnepbf16 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0xd4] + vsubnepbf16 xmm2, xmm3, xmm4 + +// CHECK: vsubnepbf16 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0xd4] + vsubnepbf16 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0xd4] + vsubnepbf16 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x4f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x65,0x58,0x5c,0x10] + vsubnepbf16 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x65,0x48,0x5c,0x14,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x65,0xcf,0x5c,0x51,0x7f] + vsubnepbf16 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: 
[0x62,0xf5,0x65,0xdf,0x5c,0x52,0x80] + vsubnepbf16 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x2f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0x38,0x5c,0x10] + vsubnepbf16 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x65,0x28,0x5c,0x14,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x65,0xaf,0x5c,0x51,0x7f] + vsubnepbf16 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x65,0xbf,0x5c,0x52,0x80] + vsubnepbf16 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x94,0xf4,0x00,0x00,0x00,0x10] + vsubnepbf16 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x65,0x0f,0x5c,0x94,0x87,0x23,0x01,0x00,0x00] + vsubnepbf16 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x18,0x5c,0x10] + vsubnepbf16 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x65,0x08,0x5c,0x14,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x65,0x8f,0x5c,0x51,0x7f] + vsubnepbf16 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x65,0x9f,0x5c,0x52,0x80] + vsubnepbf16 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-att.s b/llvm/test/MC/X86/avx10.2-bf16-64-att.s new file mode 100644 index 0000000000000..85d99cfe0a704 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-64-att.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x58,0xf0] + vaddnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0x85,0x45,0xc7,0x58,0xf0] + vaddnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x58,0xf0] + vaddnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f] + vaddnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f] + vaddnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xe5,0x45,0x87,0x58,0x71,0x7f] + vaddnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x58,0x72,0x80] + vaddnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcmppbf16 $123, %ymm24, %ymm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b] + vcmppbf16 $123, %ymm24, %ymm23, %k5 + +// CHECK: vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b] + vcmppbf16 $123, %ymm24, %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %xmm24, %xmm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b] + vcmppbf16 $123, %xmm24, %xmm23, %k5 + +// CHECK: vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b] + vcmppbf16 $123, %xmm24, %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, %zmm24, %zmm23, %k5 +// CHECK: encoding: [0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b] + vcmppbf16 $123, %zmm24, %zmm23, %k5 + +// CHECK: vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b] + vcmppbf16 $123, %zmm24, %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %zmm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to32}, %zmm23, %k5 + +// CHECK: vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 $123, -2048(,%rbp,2), %zmm23, %k5 + +// CHECK: vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 8128(%rcx), %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to32}, %zmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %xmm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to8}, %xmm23, %k5 + +// CHECK: vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 $123, -512(,%rbp,2), %xmm23, %k5 + +// CHECK: vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 2032(%rcx), %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to8}, %xmm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 +// CHECK: encoding: 
[0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 $123, 268435456(%rbp,%r14,8), %ymm23, %k5 + +// CHECK: vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 $123, 291(%r8,%rax,4), %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 $123, (%rip){1to16}, %ymm23, %k5 + +// CHECK: vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 +// CHECK: encoding: [0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 $123, -1024(,%rbp,2), %ymm23, %k5 + +// CHECK: vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b] + vcmppbf16 $123, 4064(%rcx), %ymm23, %k5 {%k7} + +// CHECK: vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b] + vcmppbf16 $123, -256(%rdx){1to16}, %ymm23, %k5 {%k7} + +// CHECK: vcomsbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xf7] + vcomsbf16 %xmm23, %xmm22 + +// CHECK: vcomsbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomsbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcomsbf16 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomsbf16 291(%r8,%rax,4), %xmm22 + +// CHECK: vcomsbf16 (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomsbf16 (%rip), %xmm22 + +// CHECK: vcomsbf16 -64(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 -64(,%rbp,2), %xmm22 + +// CHECK: vcomsbf16 254(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f] + vcomsbf16 254(%rcx), %xmm22 + +// CHECK: vcomsbf16 -256(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80] + vcomsbf16 -256(%rdx), %xmm22 + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5e,0xf0] + vdivnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5e,0xf0] + vdivnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5e,0xf0] + vdivnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 
268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f] + vdivnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f] + vdivnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f] + vdivnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5e,0x72,0x80] + vdivnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x98,0xf0] + vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x98,0xf0] + vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x98,0xf0] + 
vfmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x98,0xf0] + vfmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x98,0xf0] + vfmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f] + vfmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f] + vfmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 
291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x98,0x71,0x7f] + vfmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x98,0x72,0x80] + vfmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xa8,0xf0] + vfmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xa8,0xf0] + vfmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xa8,0xf0] + vfmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f] + vfmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), 
%ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f] + vfmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f] + vfmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xa8,0x72,0x80] + vfmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xb8,0xf0] + vfmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xb8,0xf0] + vfmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd231nepbf16 %xmm24, 
%xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xb8,0xf0] + vfmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f] + vfmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80] + vfmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f] + vfmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80] + vfmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f] + vfmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xb8,0x72,0x80] + vfmadd231nepbf16 
-256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9a,0xf0] + vfmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9a,0xf0] + vfmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9a,0xf0] + vfmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f] + vfmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f] + vfmsub132nepbf16 
4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f] + vfmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9a,0x72,0x80] + vfmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xaa,0xf0] + vfmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xaa,0xf0] + vfmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xaa,0xf0] + vfmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff] + 
vfmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f] + vfmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f] + vfmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f] + vfmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xaa,0x72,0x80] + vfmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xba,0xf0] + vfmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub231nepbf16 %zmm24, 
%zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xba,0xf0] + vfmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xba,0xf0] + vfmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f] + vfmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f] + vfmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 -512(,%rbp,2), 
%xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xba,0x71,0x7f] + vfmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xba,0x72,0x80] + vfmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9c,0xf0] + vfnmadd132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9c,0xf0] + vfnmadd132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9c,0xf0] + vfnmadd132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: 
vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f] + vfnmadd132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9c,0x72,0x80] + vfnmadd132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xac,0xf0] + vfnmadd213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xac,0xf0] + vfnmadd213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xac,0xf0] + vfnmadd213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %zmm23, 
%zmm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f] + vfnmadd213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f] + vfnmadd213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xac,0x71,0x7f] + vfnmadd213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xac,0x72,0x80] + vfnmadd213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0x86,0x44,0x27,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbc,0xf0] + vfnmadd231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbc,0xf0] + vfnmadd231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbc,0xf0] + vfnmadd231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// 
CHECK: vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f] + vfnmadd231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbc,0x72,0x80] + vfnmadd231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9e,0xf0] + vfnmsub132nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9e,0xf0] + vfnmsub132nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9e,0xf0] + vfnmsub132nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub132nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 8128(%rcx), %zmm23, 
%zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f] + vfnmsub132nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9e,0x72,0x80] + vfnmsub132nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xae,0xf0] + vfnmsub213nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xae,0xf0] + vfnmsub213nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, 
%xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xae,0xf0] + vfnmsub213nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f] + vfnmsub213nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f] + vfnmsub213nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + 
+// CHECK: vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xae,0x71,0x7f] + vfnmsub213nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xae,0x72,0x80] + vfnmsub213nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbe,0xf0] + vfnmsub231nepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbe,0xf0] + vfnmsub231nepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbe,0xf0] + vfnmsub231nepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 
(%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f] + vfnmsub231nepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbe,0x72,0x80] + vfnmsub231nepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vfpclasspbf16 $123, %zmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b] + vfpclasspbf16 $123, %zmm23, %k5 + +// CHECK: vfpclasspbf16 $123, %zmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %zmm23, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %ymm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b] + vfpclasspbf16 $123, %ymm23, %k5 + +// CHECK: vfpclasspbf16 $123, %ymm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %ymm23, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, %xmm23, %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b] + vfpclasspbf16 $123, %xmm23, %k5 + +// CHECK: vfpclasspbf16 $123, %xmm23, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b] + vfpclasspbf16 $123, %xmm23, %k5 {%k7} + +// CHECK: vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16x $123, 268435456(%rbp,%r14,8), %k5 + +// CHECK: vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} +// CHECK: encoding: [0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16x $123, 291(%r8,%rax,4), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to8}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to8}, %k5 + +// CHECK: vfpclasspbf16x $123, -512(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16x $123, -512(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16x $123, 2032(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, 
-256(%rdx){1to8}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to8}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to16}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to16}, %k5 + +// CHECK: vfpclasspbf16y $123, -1024(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16y $123, -1024(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16y $123, 4064(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to16}, %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, (%rip){1to32}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 $123, (%rip){1to32}, %k5 + +// CHECK: vfpclasspbf16z $123, -2048(,%rbp,2), %k5 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16z $123, -2048(,%rbp,2), %k5 + +// CHECK: vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16z $123, 8128(%rcx), %k5 {%k7} + +// CHECK: vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 $123, -256(%rdx){1to32}, %k5 {%k7} + +// CHECK: vgetexppbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 + +// CHECK: vgetexppbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vgetexppbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7] + vgetexppbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 + +// CHECK: vgetexppbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vgetexppbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7] + vgetexppbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vgetexppbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 + +// CHECK: vgetexppbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vgetexppbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7] + vgetexppbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to8}, %xmm22 + +// CHECK: vgetexppbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vgetexppbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f] + vgetexppbf16 2032(%rcx), 
%xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to16}, %ymm22 + +// CHECK: vgetexppbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f] + vgetexppbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vgetexppbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 (%rip){1to32}, %zmm22 + +// CHECK: vgetexppbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f] + vgetexppbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80] + vgetexppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %xmm23, %xmm22 + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b] + vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b] + 
vgetmantpbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vgetmantpbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 + +// 
CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5f,0xf0] + vmaxpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5f,0xf0] + vmaxpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5f,0xf0] + vmaxpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f] + vmaxpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f] + vmaxpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 
{%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f] + vmaxpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5f,0x72,0x80] + vmaxpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5d,0xf0] + vminpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5d,0xf0] + vminpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5d,0xf0] + vminpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f] + vminpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), 
%ymm23, %ymm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f] + vminpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vminpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f] + vminpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5d,0x72,0x80] + vminpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x59,0xf0] + vmulnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x59,0xf0] + vmulnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x59,0xf0] + vmulnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to32}, %zmm23, 
%zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f] + vmulnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f] + vmulnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x59,0x71,0x7f] + vmulnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x59,0x72,0x80] + vmulnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 + +// CHECK: vrcppbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vrcppbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4c,0xf7] + vrcppbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 + +// CHECK: vrcppbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vrcppbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xa6,0x7c,0xcf,0x4c,0xf7] + vrcppbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrcppbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 + +// CHECK: vrcppbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vrcppbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x4c,0xf7] + vrcppbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to8}, %xmm22 + +// CHECK: vrcppbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f] + vrcppbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to16}, %ymm22 + +// CHECK: vrcppbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f] + vrcppbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrcppbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrcppbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 (%rip){1to32}, %zmm22 + +// CHECK: vrcppbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f] + vrcppbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80] + vrcppbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 + +// CHECK: vreducenepbf16 $123, 
%zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b] + vreducenepbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b] + vreducenepbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b] + vreducenepbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vreducenepbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 
+// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vreducenepbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b] + vreducenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b] + vrndscalenepbf16 $123, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to8}, %xmm22 + +// CHECK: vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -512(,%rbp,2), %xmm22 + +// CHECK: vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: 
encoding: [0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to16}, %ymm22 + +// CHECK: vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -1024(,%rbp,2), %ymm22 + +// CHECK: vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 $123, 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 $123, 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 $123, (%rip){1to32}, %zmm22 + +// CHECK: vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 $123, -2048(,%rbp,2), %zmm22 + +// CHECK: vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 $123, 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 $123, -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4e,0xf7] + vrsqrtpbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4e,0xf7] + vrsqrtpbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: 
encoding: [0x62,0xa6,0x7c,0xaf,0x4e,0xf7] + vrsqrtpbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to8}, %xmm22 + +// CHECK: vrsqrtpbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f] + vrsqrtpbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to16}, %ymm22 + +// CHECK: vrsqrtpbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 -1024(,%rbp,2), %ymm22 + +// CHECK: vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f] + vrsqrtpbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vrsqrtpbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 (%rip){1to32}, %zmm22 + +// CHECK: vrsqrtpbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f] + vrsqrtpbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80] + vrsqrtpbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x2c,0xf0] + vscalefpbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 
%zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x2c,0xf0] + vscalefpbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x2c,0xf0] + vscalefpbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f] + vscalefpbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f] + vscalefpbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: 
encoding: [0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f] + vscalefpbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x2c,0x72,0x80] + vscalefpbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 {%k7} + +// CHECK: vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x51,0xf7] + vsqrtnepbf16 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 {%k7} + +// CHECK: vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x51,0xf7] + vsqrtnepbf16 %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 {%k7} + +// CHECK: vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x51,0xf7] + vsqrtnepbf16 %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to8}, %xmm22 + +// CHECK: vsqrtnepbf16 -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 -512(,%rbp,2), %xmm22 + +// CHECK: vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f] + vsqrtnepbf16 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to16}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to16}, %ymm22 + +// CHECK: vsqrtnepbf16 -1024(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 -1024(,%rbp,2), %ymm22 + +// 
CHECK: vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f] + vsqrtnepbf16 4064(%rcx), %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to16}, %ymm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vsqrtnepbf16 (%rip){1to32}, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 (%rip){1to32}, %zmm22 + +// CHECK: vsqrtnepbf16 -2048(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 -2048(,%rbp,2), %zmm22 + +// CHECK: vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f] + vsqrtnepbf16 8128(%rcx), %zmm22 {%k7} {z} + +// CHECK: vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80] + vsqrtnepbf16 -256(%rdx){1to32}, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5c,0xf0] + vsubnepbf16 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5c,0xf0] + vsubnepbf16 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5c,0xf0] + vsubnepbf16 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vsubnepbf16 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f] + vsubnepbf16 
8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f] + vsubnepbf16 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f] + vsubnepbf16 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5c,0x72,0x80] + vsubnepbf16 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx10.2-bf16-64-intel.s b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s new file mode 100644 index 0000000000000..5f3dc45ba7745 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-bf16-64-intel.s @@ -0,0 +1,3014 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vaddnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x58,0xf0] + vaddnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vaddnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x58,0xf0] + vaddnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x58,0xf0] + vaddnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vaddnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x58,0xf0] + vaddnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vaddnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x58,0xf0] + vaddnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: 
[0x62,0x85,0x45,0xc7,0x58,0xf0] + vaddnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vaddnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x58,0xf0] + vaddnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vaddnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x58,0xf0] + vaddnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x58,0xf0] + vaddnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x58,0x34,0x6d,0x00,0xf8,0xff,0xff] + vaddnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x58,0x71,0x7f] + vaddnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x58,0x72,0x80] + vaddnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x58,0x34,0x6d,0x00,0xfc,0xff,0xff] + vaddnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x58,0x71,0x7f] + vaddnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x58,0x72,0x80] + vaddnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x58,0xb4,0x80,0x23,0x01,0x00,0x00] + vaddnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x58,0x35,0x00,0x00,0x00,0x00] + vaddnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// 
CHECK: vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x58,0x34,0x6d,0x00,0xfe,0xff,0xff] + vaddnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x58,0x71,0x7f] + vaddnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x58,0x72,0x80] + vaddnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcmppbf16 k5, ymm23, ymm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x20,0xc2,0xe8,0x7b] + vcmppbf16 k5, ymm23, ymm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x27,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymm24, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x00,0xc2,0xe8,0x7b] + vcmppbf16 k5, xmm23, xmm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x07,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmm24, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x40,0xc2,0xe8,0x7b] + vcmppbf16 k5, zmm23, zmm24, 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmm24, 123 +// CHECK: encoding: [0x62,0x93,0x47,0x47,0xc2,0xe8,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmm24, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x40,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, zmm23, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x47,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x50,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, zmm23, word ptr [rip]{1to32}, 123 + +// CHECK: vcmppbf16 k5, zmm23, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x40,0xc2,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vcmppbf16 k5, zmm23, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x47,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, zmm23, zmmword ptr [rcx + 8128], 123 + +// CHECK: vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x57,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, zmm23, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x00,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, xmm23, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x07,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x10,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, xmm23, word ptr [rip]{1to8}, 123 + +// CHECK: vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x00,0xc2,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vcmppbf16 k5, xmm23, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 
2032], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x07,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, xmm23, xmmword ptr [rcx + 2032], 123 + +// CHECK: vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x17,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, xmm23, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x47,0x20,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmppbf16 k5, ymm23, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x47,0x27,0xc2,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x30,0xc2,0x2d,0x00,0x00,0x00,0x00,0x7b] + vcmppbf16 k5, ymm23, word ptr [rip]{1to16}, 123 + +// CHECK: vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x20,0xc2,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vcmppbf16 k5, ymm23, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x27,0xc2,0x69,0x7f,0x7b] + vcmppbf16 k5 {k7}, ymm23, ymmword ptr [rcx + 4064], 123 + +// CHECK: vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x47,0x37,0xc2,0x6a,0x80,0x7b] + vcmppbf16 k5 {k7}, ymm23, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vcomsbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xf7] + vcomsbf16 xmm22, xmm23 + +// CHECK: vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomsbf16 xmm22, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomsbf16 xmm22, word ptr [r8 + 4*rax + 291] + +// CHECK: vcomsbf16 xmm22, word ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomsbf16 xmm22, word ptr [rip] + +// CHECK: vcomsbf16 xmm22, word ptr [2*rbp - 64] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomsbf16 xmm22, word ptr [2*rbp - 64] + +// CHECK: vcomsbf16 xmm22, word ptr [rcx + 254] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x71,0x7f] + vcomsbf16 xmm22, word ptr [rcx + 254] + +// CHECK: vcomsbf16 xmm22, word ptr [rdx - 256] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x2f,0x72,0x80] + vcomsbf16 xmm22, word ptr [rdx - 256] + +// CHECK: vdivnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5e,0xf0] + vdivnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vdivnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5e,0xf0] + vdivnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5e,0xf0] + vdivnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vdivnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5e,0xf0] + vdivnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vdivnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5e,0xf0] + vdivnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5e,0xf0] + vdivnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vdivnepbf16 xmm22, xmm23, 
xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5e,0xf0] + vdivnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vdivnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5e,0xf0] + vdivnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5e,0xf0] + vdivnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vdivnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5e,0x71,0x7f] + vdivnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5e,0x72,0x80] + vdivnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vdivnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5e,0x71,0x7f] + vdivnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5e,0x72,0x80] + vdivnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5e,0xb4,0x80,0x23,0x01,0x00,0x00] + vdivnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5e,0x35,0x00,0x00,0x00,0x00] + vdivnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: 
[0x62,0xe5,0x45,0x00,0x5e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vdivnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5e,0x71,0x7f] + vdivnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5e,0x72,0x80] + vdivnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x98,0xf0] + vfmadd132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x98,0xf0] + vfmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x98,0xf0] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x98,0xf0] + vfmadd132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x98,0xf0] + vfmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x98,0xf0] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x98,0xf0] + vfmadd132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x98,0xf0] + vfmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x98,0xf0] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x98,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x98,0x71,0x7f] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x98,0x72,0x80] + vfmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 ymm22 
{k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x98,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x98,0x71,0x7f] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x98,0x72,0x80] + vfmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x98,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x98,0x35,0x00,0x00,0x00,0x00] + vfmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x98,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x98,0x71,0x7f] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x98,0x72,0x80] + vfmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xa8,0xf0] + vfmadd213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xa8,0xf0] + vfmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xa8,0xf0] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xa8,0xf0] + vfmadd213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xa8,0xf0] + vfmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xa8,0xf0] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xa8,0xf0] + vfmadd213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xa8,0xf0] + vfmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xa8,0xf0] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 
8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xa8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xa8,0x71,0x7f] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xa8,0x72,0x80] + vfmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xa8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xa8,0x71,0x7f] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xa8,0x72,0x80] + vfmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xa8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xa8,0x35,0x00,0x00,0x00,0x00] + vfmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xa8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xa8,0x71,0x7f] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// 
CHECK: vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xa8,0x72,0x80] + vfmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xb8,0xf0] + vfmadd231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xb8,0xf0] + vfmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xb8,0xf0] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xb8,0xf0] + vfmadd231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xb8,0xf0] + vfmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xb8,0xf0] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xb8,0xf0] + vfmadd231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xb8,0xf0] + vfmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xb8,0xf0] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xb8,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xb8,0x71,0x7f] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xb8,0x72,0x80] + vfmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 
1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xb8,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xb8,0x71,0x7f] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xb8,0x72,0x80] + vfmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xb8,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xb8,0x35,0x00,0x00,0x00,0x00] + vfmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xb8,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xb8,0x71,0x7f] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xb8,0x72,0x80] + vfmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9a,0xf0] + vfmsub132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9a,0xf0] + vfmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9a,0xf0] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9a,0xf0] + vfmsub132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9a,0xf0] + vfmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9a,0xf0] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9a,0xf0] + vfmsub132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9a,0xf0] + vfmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9a,0xf0] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: 
[0x62,0xc6,0x44,0x47,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9a,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9a,0x71,0x7f] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9a,0x72,0x80] + vfmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9a,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9a,0x71,0x7f] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9a,0x72,0x80] + vfmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9a,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9a,0x35,0x00,0x00,0x00,0x00] + vfmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9a,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9a,0x71,0x7f] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9a,0x72,0x80] + vfmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: 
[0x62,0x86,0x44,0x20,0xaa,0xf0] + vfmsub213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xaa,0xf0] + vfmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xaa,0xf0] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xaa,0xf0] + vfmsub213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xaa,0xf0] + vfmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xaa,0xf0] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xaa,0xf0] + vfmsub213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xaa,0xf0] + vfmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xaa,0xf0] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xaa,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xaa,0x71,0x7f] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xaa,0x72,0x80] + vfmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xaa,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: 
[0x62,0xe6,0x44,0xa7,0xaa,0x71,0x7f] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xaa,0x72,0x80] + vfmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xaa,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xaa,0x35,0x00,0x00,0x00,0x00] + vfmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xaa,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xaa,0x71,0x7f] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xaa,0x72,0x80] + vfmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xba,0xf0] + vfmsub231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xba,0xf0] + vfmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xba,0xf0] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xba,0xf0] + vfmsub231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xba,0xf0] + vfmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xba,0xf0] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xba,0xf0] + vfmsub231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xba,0xf0] + vfmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xba,0xf0] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xba,0x35,0x00,0x00,0x00,0x00] + 
vfmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xba,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xba,0x71,0x7f] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xba,0x72,0x80] + vfmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xba,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xba,0x71,0x7f] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xba,0x72,0x80] + vfmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xba,0xb4,0x80,0x23,0x01,0x00,0x00] + vfmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xba,0x35,0x00,0x00,0x00,0x00] + vfmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xba,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xba,0x71,0x7f] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xba,0x72,0x80] + vfmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9c,0xf0] + vfnmadd132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9c,0xf0] + vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} 
{z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9c,0xf0] + vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9c,0xf0] + vfnmadd132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9c,0xf0] + vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9c,0xf0] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9c,0xf0] + vfnmadd132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9c,0xf0] + vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9c,0xf0] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9c,0x72,0x80] + vfnmadd132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9c,0x71,0x7f] + vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9c,0x72,0x80] + 
vfnmadd132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9c,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9c,0x35,0x00,0x00,0x00,0x00] + vfnmadd132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9c,0x71,0x7f] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9c,0x72,0x80] + vfnmadd132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xac,0xf0] + vfnmadd213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xac,0xf0] + vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xac,0xf0] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xac,0xf0] + vfnmadd213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xac,0xf0] + vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xac,0xf0] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xac,0xf0] + vfnmadd213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xac,0xf0] + vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xac,0xf0] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xac,0x34,0x6d,0x00,0xf8,0xff,0xff] + 
vfnmadd213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xac,0x71,0x7f] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xac,0x72,0x80] + vfnmadd213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xac,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xac,0x71,0x7f] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xac,0x72,0x80] + vfnmadd213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xac,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xac,0x35,0x00,0x00,0x00,0x00] + vfnmadd213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xac,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xac,0x71,0x7f] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xac,0x72,0x80] + vfnmadd213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbc,0xf0] + vfnmadd231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbc,0xf0] + vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbc,0xf0] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmm24 +// 
CHECK: encoding: [0x62,0x86,0x44,0x40,0xbc,0xf0] + vfnmadd231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbc,0xf0] + vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbc,0xf0] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbc,0xf0] + vfnmadd231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbc,0xf0] + vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbc,0xf0] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbc,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmadd231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xbc,0x72,0x80] + vfnmadd231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbc,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmadd231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbc,0x71,0x7f] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbc,0x72,0x80] + vfnmadd231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa6,0x44,0x00,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbc,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmadd231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbc,0x35,0x00,0x00,0x00,0x00] + vfnmadd231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbc,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmadd231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbc,0x71,0x7f] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbc,0x72,0x80] + vfnmadd231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x9e,0xf0] + vfnmsub132nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x9e,0xf0] + vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x9e,0xf0] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x9e,0xf0] + vfnmsub132nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x9e,0xf0] + vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x9e,0xf0] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x9e,0xf0] + vfnmsub132nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x9e,0xf0] + vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x9e,0xf0] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x9e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub132nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: 
[0x62,0xe6,0x44,0xc7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x9e,0x72,0x80] + vfnmsub132nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x9e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub132nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x9e,0x71,0x7f] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0x9e,0x72,0x80] + vfnmsub132nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x9e,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub132nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x9e,0x35,0x00,0x00,0x00,0x00] + vfnmsub132nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x9e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub132nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x9e,0x71,0x7f] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x9e,0x72,0x80] + vfnmsub132nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xae,0xf0] + vfnmsub213nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xae,0xf0] + vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xae,0xf0] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xae,0xf0] + vfnmsub213nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: 
encoding: [0x62,0x86,0x44,0x47,0xae,0xf0] + vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xae,0xf0] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xae,0xf0] + vfnmsub213nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xae,0xf0] + vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xae,0xf0] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xae,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub213nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xae,0x71,0x7f] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0xae,0x72,0x80] + vfnmsub213nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xae,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub213nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xae,0x71,0x7f] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xae,0x72,0x80] + vfnmsub213nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7}, 
xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xae,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub213nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xae,0x35,0x00,0x00,0x00,0x00] + vfnmsub213nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xae,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub213nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xae,0x71,0x7f] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xae,0x72,0x80] + vfnmsub213nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0xbe,0xf0] + vfnmsub231nepbf16 ymm22, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0xbe,0xf0] + vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0xbe,0xf0] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0xbe,0xf0] + vfnmsub231nepbf16 zmm22, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0xbe,0xf0] + vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0xbe,0xf0] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0xbe,0xf0] + vfnmsub231nepbf16 xmm22, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0xbe,0xf0] + vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0xbe,0xf0] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0xbe,0x34,0x6d,0x00,0xf8,0xff,0xff] + vfnmsub231nepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: 
encoding: [0x62,0xe6,0x44,0xd7,0xbe,0x72,0x80] + vfnmsub231nepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0xbe,0x34,0x6d,0x00,0xfc,0xff,0xff] + vfnmsub231nepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0xbe,0x71,0x7f] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0xb7,0xbe,0x72,0x80] + vfnmsub231nepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0xbe,0xb4,0x80,0x23,0x01,0x00,0x00] + vfnmsub231nepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0xbe,0x35,0x00,0x00,0x00,0x00] + vfnmsub231nepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0xbe,0x34,0x6d,0x00,0xfe,0xff,0xff] + vfnmsub231nepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0xbe,0x71,0x7f] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0xbe,0x72,0x80] + vfnmsub231nepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vfpclasspbf16 k5, zmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x48,0x66,0xef,0x7b] + vfpclasspbf16 k5, zmm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x4f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, zmm23, 123 + +// CHECK: vfpclasspbf16 k5, ymm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x28,0x66,0xef,0x7b] + vfpclasspbf16 k5, ymm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x2f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, ymm23, 123 + +// CHECK: vfpclasspbf16 k5, xmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xef,0x7b] + vfpclasspbf16 k5, xmm23, 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x0f,0x66,0xef,0x7b] + vfpclasspbf16 k5 {k7}, xmm23, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 
+ 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x7f,0x08,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclasspbf16 k5, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xd3,0x7f,0x0f,0x66,0xac,0x80,0x23,0x01,0x00,0x00,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x18,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x08,0x66,0x2c,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vfpclasspbf16 k5, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x0f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x1f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x38,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x28,0x66,0x2c,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vfpclasspbf16 k5, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x2f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x3f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x58,0x66,0x2d,0x00,0x00,0x00,0x00,0x7b] + vfpclasspbf16 k5, word ptr [rip]{1to32}, 123 + +// CHECK: vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x48,0x66,0x2c,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vfpclasspbf16 k5, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x4f,0x66,0x69,0x7f,0x7b] + vfpclasspbf16 k5 {k7}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7f,0x5f,0x66,0x6a,0x80,0x7b] + vfpclasspbf16 k5 {k7}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vgetexppbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xf7] + vgetexppbf16 xmm22, xmm23 + +// CHECK: vgetexppbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x42,0xf7] + vgetexppbf16 xmm22 {k7}, xmm23 + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x42,0xf7] + vgetexppbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vgetexppbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xf7] + vgetexppbf16 zmm22, zmm23 + +// CHECK: vgetexppbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x42,0xf7] + vgetexppbf16 zmm22 {k7}, zmm23 + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x42,0xf7] + vgetexppbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vgetexppbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xf7] + vgetexppbf16 ymm22, ymm23 + +// CHECK: 
vgetexppbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x42,0xf7] + vgetexppbf16 ymm22 {k7}, ymm23 + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x42,0xf7] + vgetexppbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x42,0x34,0x6d,0x00,0xfe,0xff,0xff] + vgetexppbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x42,0x71,0x7f] + vgetexppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x9f,0x42,0x72,0x80] + vgetexppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x42,0x34,0x6d,0x00,0xfc,0xff,0xff] + vgetexppbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x42,0x71,0x7f] + vgetexppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x42,0x72,0x80] + vgetexppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x42,0xb4,0x80,0x23,0x01,0x00,0x00] + vgetexppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vgetexppbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x42,0x35,0x00,0x00,0x00,0x00] + vgetexppbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x42,0x34,0x6d,0x00,0xf8,0xff,0xff] + vgetexppbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x42,0x71,0x7f] + vgetexppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} 
+// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x42,0x72,0x80] + vgetexppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vgetmantpbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22, zmm23, 123 + +// CHECK: vgetmantpbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x26,0xf7,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vgetmantpbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22, ymm23, 123 + +// CHECK: vgetmantpbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x26,0xf7,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vgetmantpbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x26,0xf7,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x26,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vgetmantpbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x26,0x72,0x80,0x7b] + vgetmantpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x26,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vgetmantpbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, 
ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x26,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vgetmantpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x26,0x35,0x00,0x00,0x00,0x00,0x7b] + vgetmantpbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x26,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vgetmantpbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x26,0x71,0x7f,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x26,0x72,0x80,0x7b] + vgetmantpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vmaxpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5f,0xf0] + vmaxpbf16 ymm22, ymm23, ymm24 + +// CHECK: vmaxpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5f,0xf0] + vmaxpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5f,0xf0] + vmaxpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vmaxpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5f,0xf0] + vmaxpbf16 zmm22, zmm23, zmm24 + +// CHECK: vmaxpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5f,0xf0] + vmaxpbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5f,0xf0] + vmaxpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vmaxpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5f,0xf0] + vmaxpbf16 xmm22, xmm23, xmm24 + +// CHECK: vmaxpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5f,0xf0] + vmaxpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5f,0xf0] + vmaxpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: 
[0x62,0xe5,0x45,0x40,0x5f,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmaxpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5f,0x71,0x7f] + vmaxpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5f,0x72,0x80] + vmaxpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5f,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmaxpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5f,0x71,0x7f] + vmaxpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5f,0x72,0x80] + vmaxpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5f,0xb4,0x80,0x23,0x01,0x00,0x00] + vmaxpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5f,0x35,0x00,0x00,0x00,0x00] + vmaxpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5f,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmaxpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5f,0x71,0x7f] + vmaxpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5f,0x72,0x80] + vmaxpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vminpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5d,0xf0] + vminpbf16 ymm22, ymm23, ymm24 + +// CHECK: vminpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5d,0xf0] + vminpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5d,0xf0] + vminpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vminpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5d,0xf0] + vminpbf16 zmm22, zmm23, zmm24 + +// CHECK: vminpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5d,0xf0] + vminpbf16 zmm22 {k7}, zmm23, zmm24 + +// 
CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5d,0xf0] + vminpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vminpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5d,0xf0] + vminpbf16 xmm22, xmm23, xmm24 + +// CHECK: vminpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5d,0xf0] + vminpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5d,0xf0] + vminpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5d,0x34,0x6d,0x00,0xf8,0xff,0xff] + vminpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5d,0x71,0x7f] + vminpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5d,0x72,0x80] + vminpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5d,0x34,0x6d,0x00,0xfc,0xff,0xff] + vminpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5d,0x71,0x7f] + vminpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5d,0x72,0x80] + vminpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5d,0xb4,0x80,0x23,0x01,0x00,0x00] + vminpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5d,0x35,0x00,0x00,0x00,0x00] + vminpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: 
vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5d,0x34,0x6d,0x00,0xfe,0xff,0xff] + vminpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5d,0x71,0x7f] + vminpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5d,0x72,0x80] + vminpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x59,0xf0] + vmulnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vmulnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x59,0xf0] + vmulnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x59,0xf0] + vmulnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vmulnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x59,0xf0] + vmulnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vmulnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x59,0xf0] + vmulnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x59,0xf0] + vmulnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vmulnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x59,0xf0] + vmulnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vmulnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x59,0xf0] + vmulnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x87,0x59,0xf0] + vmulnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x59,0x34,0x6d,0x00,0xf8,0xff,0xff] + vmulnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x59,0x71,0x7f] + vmulnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x59,0x72,0x80] + vmulnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// 
CHECK: encoding: [0x62,0xe5,0x45,0x30,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x59,0x34,0x6d,0x00,0xfc,0xff,0xff] + vmulnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x59,0x71,0x7f] + vmulnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x59,0x72,0x80] + vmulnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x59,0xb4,0x80,0x23,0x01,0x00,0x00] + vmulnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x59,0x35,0x00,0x00,0x00,0x00] + vmulnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x59,0x34,0x6d,0x00,0xfe,0xff,0xff] + vmulnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x59,0x71,0x7f] + vmulnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x59,0x72,0x80] + vmulnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vrcppbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xf7] + vrcppbf16 xmm22, xmm23 + +// CHECK: vrcppbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4c,0xf7] + vrcppbf16 xmm22 {k7}, xmm23 + +// CHECK: vrcppbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4c,0xf7] + vrcppbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vrcppbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xf7] + vrcppbf16 zmm22, zmm23 + +// CHECK: vrcppbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4c,0xf7] + vrcppbf16 zmm22 {k7}, zmm23 + +// CHECK: vrcppbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4c,0xf7] + vrcppbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vrcppbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xf7] + vrcppbf16 ymm22, ymm23 + +// CHECK: vrcppbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4c,0xf7] + vrcppbf16 ymm22 {k7}, ymm23 + +// CHECK: vrcppbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xaf,0x4c,0xf7] + vrcppbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4c,0x35,0x00,0x00,0x00,0x00] + 
vrcppbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrcppbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4c,0x71,0x7f] + vrcppbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4c,0x72,0x80] + vrcppbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrcppbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4c,0x71,0x7f] + vrcppbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4c,0x72,0x80] + vrcppbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcppbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4c,0xb4,0x80,0x23,0x01,0x00,0x00] + vrcppbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrcppbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4c,0x35,0x00,0x00,0x00,0x00] + vrcppbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrcppbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4c,0x71,0x7f] + vrcppbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4c,0x72,0x80] + vrcppbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vreducenepbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xf7,0x7b] + vreducenepbf16 zmm22, zmm23, 123 + +// CHECK: vreducenepbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x56,0xf7,0x7b] + vreducenepbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x56,0xf7,0x7b] + vreducenepbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vreducenepbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xf7,0x7b] + vreducenepbf16 ymm22, ymm23, 123 + +// CHECK: vreducenepbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x56,0xf7,0x7b] + vreducenepbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vreducenepbf16 ymm22 {k7} 
{z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x56,0xf7,0x7b] + vreducenepbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vreducenepbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xf7,0x7b] + vreducenepbf16 xmm22, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x56,0xf7,0x7b] + vreducenepbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x56,0xf7,0x7b] + vreducenepbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x56,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vreducenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x56,0x71,0x7f,0x7b] + vreducenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x56,0x72,0x80,0x7b] + vreducenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x56,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vreducenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x56,0x72,0x80,0x7b] + vreducenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x56,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vreducenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vreducenepbf16 zmm22, 
word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x56,0x35,0x00,0x00,0x00,0x00,0x7b] + vreducenepbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x56,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vreducenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x56,0x71,0x7f,0x7b] + vreducenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x56,0x72,0x80,0x7b] + vreducenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22, zmm23, 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x4f,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22 {k7}, zmm23, 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xcf,0x08,0xf7,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, zmm23, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x28,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22, ymm23, 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x2f,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22 {k7}, ymm23, 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0xaf,0x08,0xf7,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, ymm23, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x0f,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22 {k7}, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x8f,0x08,0xf7,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, xmm23, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x08,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x0f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x18,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 xmm22, word ptr [rip]{1to8}, 123 + +// CHECK: vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x08,0x08,0x34,0x6d,0x00,0xfe,0xff,0xff,0x7b] + vrndscalenepbf16 xmm22, xmmword ptr [2*rbp - 512], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x8f,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032], 123 + +// CHECK: vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x9f,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: 
[0x62,0xa3,0x7f,0x28,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x2f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x38,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 ymm22, word ptr [rip]{1to16}, 123 + +// CHECK: vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x28,0x08,0x34,0x6d,0x00,0xfc,0xff,0xff,0x7b] + vrndscalenepbf16 ymm22, ymmword ptr [2*rbp - 1024], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xaf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064], 123 + +// CHECK: vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xbf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xa3,0x7f,0x48,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalenepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 +// CHECK: encoding: [0x62,0xc3,0x7f,0x4f,0x08,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b] + vrndscalenepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291], 123 + +// CHECK: vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x58,0x08,0x35,0x00,0x00,0x00,0x00,0x7b] + vrndscalenepbf16 zmm22, word ptr [rip]{1to32}, 123 + +// CHECK: vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x08,0x34,0x6d,0x00,0xf8,0xff,0xff,0x7b] + vrndscalenepbf16 zmm22, zmmword ptr [2*rbp - 2048], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xcf,0x08,0x71,0x7f,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128], 123 + +// CHECK: vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0xdf,0x08,0x72,0x80,0x7b] + vrndscalenepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 + +// CHECK: vrsqrtpbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xf7] + vrsqrtpbf16 xmm22, xmm23 + +// CHECK: vrsqrtpbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x0f,0x4e,0xf7] + vrsqrtpbf16 xmm22 {k7}, xmm23 + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x8f,0x4e,0xf7] + vrsqrtpbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vrsqrtpbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xf7] + vrsqrtpbf16 zmm22, zmm23 + +// CHECK: vrsqrtpbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x4f,0x4e,0xf7] + vrsqrtpbf16 zmm22 {k7}, zmm23 + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0xcf,0x4e,0xf7] + vrsqrtpbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vrsqrtpbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xf7] + vrsqrtpbf16 ymm22, ymm23 + +// CHECK: vrsqrtpbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa6,0x7c,0x2f,0x4e,0xf7] + vrsqrtpbf16 ymm22 {k7}, ymm23 + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: 
[0x62,0xa6,0x7c,0xaf,0x4e,0xf7] + vrsqrtpbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x08,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x0f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x18,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x7c,0x08,0x4e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vrsqrtpbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x7c,0x8f,0x4e,0x71,0x7f] + vrsqrtpbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x7c,0x9f,0x4e,0x72,0x80] + vrsqrtpbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x28,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x2f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0x38,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x7c,0x28,0x4e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vrsqrtpbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x7c,0xaf,0x4e,0x71,0x7f] + vrsqrtpbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe6,0x7c,0xbf,0x4e,0x72,0x80] + vrsqrtpbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x7c,0x48,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtpbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x7c,0x4f,0x4e,0xb4,0x80,0x23,0x01,0x00,0x00] + vrsqrtpbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vrsqrtpbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0x58,0x4e,0x35,0x00,0x00,0x00,0x00] + vrsqrtpbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x7c,0x48,0x4e,0x34,0x6d,0x00,0xf8,0xff,0xff] + vrsqrtpbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x7c,0xcf,0x4e,0x71,0x7f] + vrsqrtpbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x7c,0xdf,0x4e,0x72,0x80] + vrsqrtpbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x20,0x2c,0xf0] + 
vscalefpbf16 ymm22, ymm23, ymm24 + +// CHECK: vscalefpbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0x27,0x2c,0xf0] + vscalefpbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x86,0x44,0xa7,0x2c,0xf0] + vscalefpbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vscalefpbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x40,0x2c,0xf0] + vscalefpbf16 zmm22, zmm23, zmm24 + +// CHECK: vscalefpbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x47,0x2c,0xf0] + vscalefpbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x86,0x44,0xc7,0x2c,0xf0] + vscalefpbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vscalefpbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x00,0x2c,0xf0] + vscalefpbf16 xmm22, xmm23, xmm24 + +// CHECK: vscalefpbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x07,0x2c,0xf0] + vscalefpbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x86,0x44,0x87,0x2c,0xf0] + vscalefpbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x40,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x47,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0x50,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe6,0x44,0x40,0x2c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vscalefpbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe6,0x44,0xc7,0x2c,0x71,0x7f] + vscalefpbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe6,0x44,0xd7,0x2c,0x72,0x80] + vscalefpbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x20,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x27,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe6,0x44,0x30,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe6,0x44,0x20,0x2c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vscalefpbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe6,0x44,0xa7,0x2c,0x71,0x7f] + vscalefpbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: 
[0x62,0xe6,0x44,0xb7,0x2c,0x72,0x80] + vscalefpbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa6,0x44,0x00,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefpbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc6,0x44,0x07,0x2c,0xb4,0x80,0x23,0x01,0x00,0x00] + vscalefpbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x10,0x2c,0x35,0x00,0x00,0x00,0x00] + vscalefpbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe6,0x44,0x00,0x2c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vscalefpbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe6,0x44,0x87,0x2c,0x71,0x7f] + vscalefpbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe6,0x44,0x97,0x2c,0x72,0x80] + vscalefpbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtnepbf16 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xf7] + vsqrtnepbf16 xmm22, xmm23 + +// CHECK: vsqrtnepbf16 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x0f,0x51,0xf7] + vsqrtnepbf16 xmm22 {k7}, xmm23 + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x8f,0x51,0xf7] + vsqrtnepbf16 xmm22 {k7} {z}, xmm23 + +// CHECK: vsqrtnepbf16 zmm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xf7] + vsqrtnepbf16 zmm22, zmm23 + +// CHECK: vsqrtnepbf16 zmm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x4f,0x51,0xf7] + vsqrtnepbf16 zmm22 {k7}, zmm23 + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xcf,0x51,0xf7] + vsqrtnepbf16 zmm22 {k7} {z}, zmm23 + +// CHECK: vsqrtnepbf16 ymm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xf7] + vsqrtnepbf16 ymm22, ymm23 + +// CHECK: vsqrtnepbf16 ymm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x2f,0x51,0xf7] + vsqrtnepbf16 ymm22 {k7}, ymm23 + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0xaf,0x51,0xf7] + vsqrtnepbf16 ymm22 {k7} {z}, ymm23 + +// CHECK: vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x0f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7d,0x18,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 xmm22, word ptr [rip]{1to8} + +// CHECK: vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsqrtnepbf16 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7d,0x8f,0x51,0x71,0x7f] + vsqrtnepbf16 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: 
[0x62,0xe5,0x7d,0x9f,0x51,0x72,0x80] + vsqrtnepbf16 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x28,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 ymm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x2f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 ymm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 ymm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0x38,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 ymm22, word ptr [rip]{1to16} + +// CHECK: vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7d,0x28,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsqrtnepbf16 ymm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7d,0xaf,0x51,0x71,0x7f] + vsqrtnepbf16 ymm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7d,0xbf,0x51,0x72,0x80] + vsqrtnepbf16 ymm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x48,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtnepbf16 zmm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x4f,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vsqrtnepbf16 zmm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsqrtnepbf16 zmm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0x58,0x51,0x35,0x00,0x00,0x00,0x00] + vsqrtnepbf16 zmm22, word ptr [rip]{1to32} + +// CHECK: vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7d,0x48,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsqrtnepbf16 zmm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7d,0xcf,0x51,0x71,0x7f] + vsqrtnepbf16 zmm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7d,0xdf,0x51,0x72,0x80] + vsqrtnepbf16 zmm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x20,0x5c,0xf0] + vsubnepbf16 ymm22, ymm23, ymm24 + +// CHECK: vsubnepbf16 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0x27,0x5c,0xf0] + vsubnepbf16 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x45,0xa7,0x5c,0xf0] + vsubnepbf16 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vsubnepbf16 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x40,0x5c,0xf0] + vsubnepbf16 zmm22, zmm23, zmm24 + +// CHECK: vsubnepbf16 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x47,0x5c,0xf0] + vsubnepbf16 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x45,0xc7,0x5c,0xf0] + vsubnepbf16 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vsubnepbf16 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x00,0x5c,0xf0] + vsubnepbf16 xmm22, xmm23, xmm24 + +// CHECK: vsubnepbf16 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x45,0x07,0x5c,0xf0] + vsubnepbf16 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 
+// CHECK: encoding: [0x62,0x85,0x45,0x87,0x5c,0xf0] + vsubnepbf16 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x40,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x47,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0x50,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x45,0x40,0x5c,0x34,0x6d,0x00,0xf8,0xff,0xff] + vsubnepbf16 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x45,0xc7,0x5c,0x71,0x7f] + vsubnepbf16 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x45,0xd7,0x5c,0x72,0x80] + vsubnepbf16 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x20,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x27,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0x30,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x45,0x20,0x5c,0x34,0x6d,0x00,0xfc,0xff,0xff] + vsubnepbf16 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x45,0xa7,0x5c,0x71,0x7f] + vsubnepbf16 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x45,0xb7,0x5c,0x72,0x80] + vsubnepbf16 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x45,0x00,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubnepbf16 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x45,0x07,0x5c,0xb4,0x80,0x23,0x01,0x00,0x00] + vsubnepbf16 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x10,0x5c,0x35,0x00,0x00,0x00,0x00] + vsubnepbf16 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x45,0x00,0x5c,0x34,0x6d,0x00,0xfe,0xff,0xff] + vsubnepbf16 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x45,0x87,0x5c,0x71,0x7f] + vsubnepbf16 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: 
vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x45,0x97,0x5c,0x72,0x80] + vsubnepbf16 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index b88abbb461d08..286fb4904870c 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1176,6 +1176,8 @@ static const X86FoldTableEntry Table1[] = { {X86::VCOMISSZrr_Int, X86::VCOMISSZrm_Int, TB_NO_REVERSE}, {X86::VCOMISSrr, X86::VCOMISSrm, 0}, {X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE}, + {X86::VCOMSBF16Zrr, X86::VCOMSBF16Zrm, 0}, + {X86::VCOMSBF16Zrr_Int, X86::VCOMSBF16Zrm_Int, TB_NO_REVERSE}, {X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0}, {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE}, {X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0}, @@ -1461,6 +1463,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE}, {X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE}, {X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE}, + {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rm, 0}, + {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rm, 0}, + {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrm, 0}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0}, {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0}, {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0}, @@ -1479,6 +1484,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VFRCZPSrr, X86::VFRCZPSrm, 0}, {X86::VFRCZSDrr, X86::VFRCZSDrm, TB_NO_REVERSE}, {X86::VFRCZSSrr, X86::VFRCZSSrm, TB_NO_REVERSE}, + {X86::VGETEXPPBF16Z128r, X86::VGETEXPPBF16Z128m, 0}, + {X86::VGETEXPPBF16Z256r, X86::VGETEXPPBF16Z256m, 0}, + {X86::VGETEXPPBF16Zr, X86::VGETEXPPBF16Zm, 0}, {X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0}, {X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0}, {X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0}, @@ -1488,6 +1496,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0}, {X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0}, {X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0}, + {X86::VGETMANTPBF16Z128rri, X86::VGETMANTPBF16Z128rmi, 0}, + {X86::VGETMANTPBF16Z256rri, X86::VGETMANTPBF16Z256rmi, 0}, + {X86::VGETMANTPBF16Zrri, X86::VGETMANTPBF16Zrmi, 0}, {X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0}, {X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0}, {X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0}, @@ -1821,11 +1832,17 @@ static const X86FoldTableEntry Table1[] = { {X86::VRCP14PSZr, X86::VRCP14PSZm, 0}, {X86::VRCP28PDZr, X86::VRCP28PDZm, 0}, {X86::VRCP28PSZr, X86::VRCP28PSZm, 0}, + {X86::VRCPPBF16Z128r, X86::VRCPPBF16Z128m, 0}, + {X86::VRCPPBF16Z256r, X86::VRCPPBF16Z256m, 0}, + {X86::VRCPPBF16Zr, X86::VRCPPBF16Zm, 0}, {X86::VRCPPHZ128r, X86::VRCPPHZ128m, 0}, {X86::VRCPPHZ256r, X86::VRCPPHZ256m, 0}, {X86::VRCPPHZr, X86::VRCPPHZm, 0}, {X86::VRCPPSYr, X86::VRCPPSYm, 0}, {X86::VRCPPSr, X86::VRCPPSm, 0}, + {X86::VREDUCENEPBF16Z128rri, X86::VREDUCENEPBF16Z128rmi, 0}, + {X86::VREDUCENEPBF16Z256rri, X86::VREDUCENEPBF16Z256rmi, 0}, + {X86::VREDUCENEPBF16Zrri, X86::VREDUCENEPBF16Zrmi, 0}, {X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0}, {X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0}, {X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0}, @@ -1835,6 +1852,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0}, {X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0}, {X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0}, + 
{X86::VRNDSCALENEPBF16Z128rri, X86::VRNDSCALENEPBF16Z128rmi, 0}, + {X86::VRNDSCALENEPBF16Z256rri, X86::VRNDSCALENEPBF16Z256rmi, 0}, + {X86::VRNDSCALENEPBF16Zrri, X86::VRNDSCALENEPBF16Zrmi, 0}, {X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0}, {X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0}, {X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0}, @@ -1856,11 +1876,17 @@ static const X86FoldTableEntry Table1[] = { {X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0}, {X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0}, {X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0}, + {X86::VRSQRTPBF16Z128r, X86::VRSQRTPBF16Z128m, 0}, + {X86::VRSQRTPBF16Z256r, X86::VRSQRTPBF16Z256m, 0}, + {X86::VRSQRTPBF16Zr, X86::VRSQRTPBF16Zm, 0}, {X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128m, 0}, {X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256m, 0}, {X86::VRSQRTPHZr, X86::VRSQRTPHZm, 0}, {X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0}, {X86::VRSQRTPSr, X86::VRSQRTPSm, 0}, + {X86::VSQRTNEPBF16Z128r, X86::VSQRTNEPBF16Z128m, 0}, + {X86::VSQRTNEPBF16Z256r, X86::VSQRTNEPBF16Z256m, 0}, + {X86::VSQRTNEPBF16Zr, X86::VSQRTNEPBF16Zm, 0}, {X86::VSQRTPDYr, X86::VSQRTPDYm, 0}, {X86::VSQRTPDZ128r, X86::VSQRTPDZ128m, 0}, {X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0}, @@ -2335,6 +2361,9 @@ static const X86FoldTableEntry Table2[] = { {X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16}, {X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16}, {X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16}, + {X86::VADDNEPBF16Z128rr, X86::VADDNEPBF16Z128rm, 0}, + {X86::VADDNEPBF16Z256rr, X86::VADDNEPBF16Z256rm, 0}, + {X86::VADDNEPBF16Zrr, X86::VADDNEPBF16Zrm, 0}, {X86::VADDPDYrr, X86::VADDPDYrm, 0}, {X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0}, {X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0}, @@ -2432,6 +2461,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE}, {X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE}, {X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE}, + {X86::VCMPPBF16Z128rri, X86::VCMPPBF16Z128rmi, 0}, + {X86::VCMPPBF16Z256rri, X86::VCMPPBF16Z256rmi, 0}, + {X86::VCMPPBF16Zrri, X86::VCMPPBF16Zrmi, 0}, {X86::VCMPPDYrri, X86::VCMPPDYrmi, 0}, {X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0}, {X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0}, @@ -2737,6 +2769,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VDBPSADBWZ128rri, X86::VDBPSADBWZ128rmi, 0}, {X86::VDBPSADBWZ256rri, X86::VDBPSADBWZ256rmi, 0}, {X86::VDBPSADBWZrri, X86::VDBPSADBWZrmi, 0}, + {X86::VDIVNEPBF16Z128rr, X86::VDIVNEPBF16Z128rm, 0}, + {X86::VDIVNEPBF16Z256rr, X86::VDIVNEPBF16Z256rm, 0}, + {X86::VDIVNEPBF16Zrr, X86::VDIVNEPBF16Zrm, 0}, {X86::VDIVPDYrr, X86::VDIVPDYrm, 0}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0}, @@ -2819,6 +2854,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE}, {X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0}, {X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE}, + {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmk, 0}, + {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmk, 0}, + {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmk, 0}, {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0}, {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0}, {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0}, @@ -2831,6 +2869,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE}, {X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE}, {X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE}, + 
{X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mkz, 0}, + {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mkz, 0}, + {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmkz, 0}, {X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0}, {X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0}, {X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0}, @@ -2843,6 +2884,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE}, {X86::VGETEXPSHZr, X86::VGETEXPSHZm, TB_NO_REVERSE}, {X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE}, + {X86::VGETMANTPBF16Z128rrikz, X86::VGETMANTPBF16Z128rmikz, 0}, + {X86::VGETMANTPBF16Z256rrikz, X86::VGETMANTPBF16Z256rmikz, 0}, + {X86::VGETMANTPBF16Zrrikz, X86::VGETMANTPBF16Zrmikz, 0}, {X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0}, {X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0}, {X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0}, @@ -2910,6 +2954,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMAXCSHZrr, X86::VMAXCSHZrm, 0}, {X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0}, {X86::VMAXCSSrr, X86::VMAXCSSrm, 0}, + {X86::VMAXPBF16Z128rr, X86::VMAXPBF16Z128rm, 0}, + {X86::VMAXPBF16Z256rr, X86::VMAXPBF16Z256rm, 0}, + {X86::VMAXPBF16Zrr, X86::VMAXPBF16Zrm, 0}, {X86::VMAXPDYrr, X86::VMAXPDYrm, 0}, {X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0}, {X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0}, @@ -2966,6 +3013,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, TB_NO_REVERSE}, {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, TB_NO_REVERSE}, {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rm, 0}, + {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rm, 0}, + {X86::VMINPBF16Zrr, X86::VMINPBF16Zrm, 0}, {X86::VMINPDYrr, X86::VMINPDYrm, 0}, {X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0}, {X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0}, @@ -3037,6 +3087,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VMPSADBWZ256rri, X86::VMPSADBWZ256rmi, 0}, {X86::VMPSADBWZrri, X86::VMPSADBWZrmi, 0}, {X86::VMPSADBWrri, X86::VMPSADBWrmi, 0}, + {X86::VMULNEPBF16Z128rr, X86::VMULNEPBF16Z128rm, 0}, + {X86::VMULNEPBF16Z256rr, X86::VMULNEPBF16Z256rm, 0}, + {X86::VMULNEPBF16Zrr, X86::VMULNEPBF16Zrm, 0}, {X86::VMULPDYrr, X86::VMULPDYrm, 0}, {X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0}, {X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0}, @@ -3887,12 +3940,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0}, {X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE}, {X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE}, + {X86::VRCPPBF16Z128rkz, X86::VRCPPBF16Z128mkz, 0}, + {X86::VRCPPBF16Z256rkz, X86::VRCPPBF16Z256mkz, 0}, + {X86::VRCPPBF16Zrkz, X86::VRCPPBF16Zmkz, 0}, {X86::VRCPPHZ128rkz, X86::VRCPPHZ128mkz, 0}, {X86::VRCPPHZ256rkz, X86::VRCPPHZ256mkz, 0}, {X86::VRCPPHZrkz, X86::VRCPPHZmkz, 0}, {X86::VRCPSHZrr, X86::VRCPSHZrm, TB_NO_REVERSE}, {X86::VRCPSSr, X86::VRCPSSm, 0}, {X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE}, + {X86::VREDUCENEPBF16Z128rrikz, X86::VREDUCENEPBF16Z128rmikz, 0}, + {X86::VREDUCENEPBF16Z256rrikz, X86::VREDUCENEPBF16Z256rmikz, 0}, + {X86::VREDUCENEPBF16Zrrikz, X86::VREDUCENEPBF16Zrmikz, 0}, {X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0}, {X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0}, {X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0}, @@ -3905,6 +3964,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE}, {X86::VREDUCESHZrri, X86::VREDUCESHZrmi, TB_NO_REVERSE}, {X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE}, + 
{X86::VRNDSCALENEPBF16Z128rrikz, X86::VRNDSCALENEPBF16Z128rmikz, 0}, + {X86::VRNDSCALENEPBF16Z256rrikz, X86::VRNDSCALENEPBF16Z256rmikz, 0}, + {X86::VRNDSCALENEPBF16Zrrikz, X86::VRNDSCALENEPBF16Zrmikz, 0}, {X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0}, {X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0}, {X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0}, @@ -3936,12 +3998,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0}, {X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE}, {X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE}, + {X86::VRSQRTPBF16Z128rkz, X86::VRSQRTPBF16Z128mkz, 0}, + {X86::VRSQRTPBF16Z256rkz, X86::VRSQRTPBF16Z256mkz, 0}, + {X86::VRSQRTPBF16Zrkz, X86::VRSQRTPBF16Zmkz, 0}, {X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mkz, 0}, {X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mkz, 0}, {X86::VRSQRTPHZrkz, X86::VRSQRTPHZmkz, 0}, {X86::VRSQRTSHZrr, X86::VRSQRTSHZrm, TB_NO_REVERSE}, {X86::VRSQRTSSr, X86::VRSQRTSSm, 0}, {X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rr, X86::VSCALEFPBF16Z128rm, 0}, + {X86::VSCALEFPBF16Z256rr, X86::VSCALEFPBF16Z256rm, 0}, + {X86::VSCALEFPBF16Zrr, X86::VSCALEFPBF16Zrm, 0}, {X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0}, {X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0}, {X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0}, @@ -3976,6 +4044,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VSM4KEY4rr, X86::VSM4KEY4rm, 0}, {X86::VSM4RNDS4Yrr, X86::VSM4RNDS4Yrm, 0}, {X86::VSM4RNDS4rr, X86::VSM4RNDS4rm, 0}, + {X86::VSQRTNEPBF16Z128rkz, X86::VSQRTNEPBF16Z128mkz, 0}, + {X86::VSQRTNEPBF16Z256rkz, X86::VSQRTNEPBF16Z256mkz, 0}, + {X86::VSQRTNEPBF16Zrkz, X86::VSQRTNEPBF16Zmkz, 0}, {X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0}, {X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0}, {X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0}, @@ -3995,6 +4066,9 @@ static const X86FoldTableEntry Table2[] = { {X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE}, {X86::VSQRTSSr, X86::VSQRTSSm, 0}, {X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rr, X86::VSUBNEPBF16Z128rm, 0}, + {X86::VSUBNEPBF16Z256rr, X86::VSUBNEPBF16Z256rm, 0}, + {X86::VSUBNEPBF16Zrr, X86::VSUBNEPBF16Zrm, 0}, {X86::VSUBPDYrr, X86::VSUBPDYrm, 0}, {X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0}, {X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0}, @@ -4069,6 +4143,9 @@ static const X86FoldTableEntry Table2[] = { }; static const X86FoldTableEntry Table3[] = { + {X86::VADDNEPBF16Z128rrkz, X86::VADDNEPBF16Z128rmkz, 0}, + {X86::VADDNEPBF16Z256rrkz, X86::VADDNEPBF16Z256rmkz, 0}, + {X86::VADDNEPBF16Zrrkz, X86::VADDNEPBF16Zrmkz, 0}, {X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0}, {X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0}, {X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0}, @@ -4115,6 +4192,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE}, {X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE}, {X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE}, + {X86::VCMPPBF16Z128rrik, X86::VCMPPBF16Z128rmik, 0}, + {X86::VCMPPBF16Z256rrik, X86::VCMPPBF16Z256rmik, 0}, + {X86::VCMPPBF16Zrrik, X86::VCMPPBF16Zrmik, 0}, {X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0}, {X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0}, {X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0}, @@ -4367,6 +4447,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VDBPSADBWZ128rrikz, X86::VDBPSADBWZ128rmikz, 0}, {X86::VDBPSADBWZ256rrikz, X86::VDBPSADBWZ256rmikz, 0}, {X86::VDBPSADBWZrrikz, X86::VDBPSADBWZrmikz, 0}, 
+ {X86::VDIVNEPBF16Z128rrkz, X86::VDIVNEPBF16Z128rmkz, 0}, + {X86::VDIVNEPBF16Z256rrkz, X86::VDIVNEPBF16Z256rmkz, 0}, + {X86::VDIVNEPBF16Zrrkz, X86::VDIVNEPBF16Zrmkz, 0}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0}, @@ -4409,6 +4492,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmi, 0}, {X86::VFIXUPIMMSDZrri, X86::VFIXUPIMMSDZrmi, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrri, X86::VFIXUPIMMSSZrmi, TB_NO_REVERSE}, + {X86::VFMADD132NEPBF16Z128r, X86::VFMADD132NEPBF16Z128m, 0}, + {X86::VFMADD132NEPBF16Z256r, X86::VFMADD132NEPBF16Z256m, 0}, + {X86::VFMADD132NEPBF16Zr, X86::VFMADD132NEPBF16Zm, 0}, {X86::VFMADD132PDYr, X86::VFMADD132PDYm, 0}, {X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, 0}, {X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0}, @@ -4432,6 +4518,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE}, {X86::VFMADD132SSr, X86::VFMADD132SSm, 0}, {X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_NO_REVERSE}, + {X86::VFMADD213NEPBF16Z128r, X86::VFMADD213NEPBF16Z128m, 0}, + {X86::VFMADD213NEPBF16Z256r, X86::VFMADD213NEPBF16Z256m, 0}, + {X86::VFMADD213NEPBF16Zr, X86::VFMADD213NEPBF16Zm, 0}, {X86::VFMADD213PDYr, X86::VFMADD213PDYm, 0}, {X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, 0}, {X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0}, @@ -4455,6 +4544,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE}, {X86::VFMADD213SSr, X86::VFMADD213SSm, 0}, {X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_NO_REVERSE}, + {X86::VFMADD231NEPBF16Z128r, X86::VFMADD231NEPBF16Z128m, 0}, + {X86::VFMADD231NEPBF16Z256r, X86::VFMADD231NEPBF16Z256m, 0}, + {X86::VFMADD231NEPBF16Zr, X86::VFMADD231NEPBF16Zm, 0}, {X86::VFMADD231PDYr, X86::VFMADD231PDYm, 0}, {X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, 0}, {X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0}, @@ -4533,6 +4625,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, 0}, {X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, 0}, {X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, 0}, + {X86::VFMSUB132NEPBF16Z128r, X86::VFMSUB132NEPBF16Z128m, 0}, + {X86::VFMSUB132NEPBF16Z256r, X86::VFMSUB132NEPBF16Z256m, 0}, + {X86::VFMSUB132NEPBF16Zr, X86::VFMSUB132NEPBF16Zm, 0}, {X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, 0}, {X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, 0}, {X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0}, @@ -4556,6 +4651,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE}, {X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0}, {X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_NO_REVERSE}, + {X86::VFMSUB213NEPBF16Z128r, X86::VFMSUB213NEPBF16Z128m, 0}, + {X86::VFMSUB213NEPBF16Z256r, X86::VFMSUB213NEPBF16Z256m, 0}, + {X86::VFMSUB213NEPBF16Zr, X86::VFMSUB213NEPBF16Zm, 0}, {X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, 0}, {X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, 0}, {X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0}, @@ -4579,6 +4677,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE}, {X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0}, {X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_NO_REVERSE}, + {X86::VFMSUB231NEPBF16Z128r, X86::VFMSUB231NEPBF16Z128m, 0}, + {X86::VFMSUB231NEPBF16Z256r, X86::VFMSUB231NEPBF16Z256m, 0}, + {X86::VFMSUB231NEPBF16Zr, X86::VFMSUB231NEPBF16Zm, 0}, {X86::VFMSUB231PDYr, 
X86::VFMSUB231PDYm, 0}, {X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, 0}, {X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0}, @@ -4657,6 +4758,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmkz, 0}, {X86::VFMULCPHZrrkz, X86::VFMULCPHZrmkz, 0}, {X86::VFMULCSHZrrkz, X86::VFMULCSHZrmkz, TB_NO_REVERSE}, + {X86::VFNMADD132NEPBF16Z128r, X86::VFNMADD132NEPBF16Z128m, 0}, + {X86::VFNMADD132NEPBF16Z256r, X86::VFNMADD132NEPBF16Z256m, 0}, + {X86::VFNMADD132NEPBF16Zr, X86::VFNMADD132NEPBF16Zm, 0}, {X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0}, {X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0}, {X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0}, @@ -4680,6 +4784,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE}, {X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0}, {X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_NO_REVERSE}, + {X86::VFNMADD213NEPBF16Z128r, X86::VFNMADD213NEPBF16Z128m, 0}, + {X86::VFNMADD213NEPBF16Z256r, X86::VFNMADD213NEPBF16Z256m, 0}, + {X86::VFNMADD213NEPBF16Zr, X86::VFNMADD213NEPBF16Zm, 0}, {X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, 0}, {X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, 0}, {X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0}, @@ -4703,6 +4810,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE}, {X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0}, {X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_NO_REVERSE}, + {X86::VFNMADD231NEPBF16Z128r, X86::VFNMADD231NEPBF16Z128m, 0}, + {X86::VFNMADD231NEPBF16Z256r, X86::VFNMADD231NEPBF16Z256m, 0}, + {X86::VFNMADD231NEPBF16Zr, X86::VFNMADD231NEPBF16Zm, 0}, {X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 0}, {X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, 0}, {X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0}, @@ -4734,6 +4844,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE}, {X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0}, {X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB132NEPBF16Z128r, X86::VFNMSUB132NEPBF16Z128m, 0}, + {X86::VFNMSUB132NEPBF16Z256r, X86::VFNMSUB132NEPBF16Z256m, 0}, + {X86::VFNMSUB132NEPBF16Zr, X86::VFNMSUB132NEPBF16Zm, 0}, {X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, 0}, {X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, 0}, {X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0}, @@ -4757,6 +4870,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE}, {X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0}, {X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB213NEPBF16Z128r, X86::VFNMSUB213NEPBF16Z128m, 0}, + {X86::VFNMSUB213NEPBF16Z256r, X86::VFNMSUB213NEPBF16Z256m, 0}, + {X86::VFNMSUB213NEPBF16Zr, X86::VFNMSUB213NEPBF16Zm, 0}, {X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, 0}, {X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, 0}, {X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0}, @@ -4780,6 +4896,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE}, {X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0}, {X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_NO_REVERSE}, + {X86::VFNMSUB231NEPBF16Z128r, X86::VFNMSUB231NEPBF16Z128m, 0}, + {X86::VFNMSUB231NEPBF16Z256r, X86::VFNMSUB231NEPBF16Z256m, 0}, + {X86::VFNMSUB231NEPBF16Zr, X86::VFNMSUB231NEPBF16Zm, 0}, {X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, 0}, {X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, 0}, 
{X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0}, @@ -4811,6 +4930,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE}, {X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0}, {X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE}, + {X86::VGETEXPPBF16Z128rk, X86::VGETEXPPBF16Z128mk, 0}, + {X86::VGETEXPPBF16Z256rk, X86::VGETEXPPBF16Z256mk, 0}, + {X86::VGETEXPPBF16Zrk, X86::VGETEXPPBF16Zmk, 0}, {X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0}, {X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0}, {X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0}, @@ -4823,6 +4945,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE}, {X86::VGETEXPSHZrkz, X86::VGETEXPSHZmkz, TB_NO_REVERSE}, {X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE}, + {X86::VGETMANTPBF16Z128rrik, X86::VGETMANTPBF16Z128rmik, 0}, + {X86::VGETMANTPBF16Z256rrik, X86::VGETMANTPBF16Z256rmik, 0}, + {X86::VGETMANTPBF16Zrrik, X86::VGETMANTPBF16Zrmik, 0}, {X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0}, {X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0}, {X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0}, @@ -4865,6 +4990,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0}, + {X86::VMAXPBF16Z128rrkz, X86::VMAXPBF16Z128rmkz, 0}, + {X86::VMAXPBF16Z256rrkz, X86::VMAXPBF16Z256rmkz, 0}, + {X86::VMAXPBF16Zrrkz, X86::VMAXPBF16Zrmkz, 0}, {X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0}, {X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0}, {X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0}, @@ -4901,6 +5029,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMINMAXSDrrikz, X86::VMINMAXSDrmikz, TB_NO_REVERSE}, {X86::VMINMAXSHrrikz, X86::VMINMAXSHrmikz, TB_NO_REVERSE}, {X86::VMINMAXSSrrikz, X86::VMINMAXSSrmikz, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0}, + {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0}, + {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0}, {X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0}, {X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0}, {X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0}, @@ -4955,6 +5086,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VMPSADBWZ128rrikz, X86::VMPSADBWZ128rmikz, 0}, {X86::VMPSADBWZ256rrikz, X86::VMPSADBWZ256rmikz, 0}, {X86::VMPSADBWZrrikz, X86::VMPSADBWZrmikz, 0}, + {X86::VMULNEPBF16Z128rrkz, X86::VMULNEPBF16Z128rmkz, 0}, + {X86::VMULNEPBF16Z256rrkz, X86::VMULNEPBF16Z256rmkz, 0}, + {X86::VMULNEPBF16Zrrkz, X86::VMULNEPBF16Zrmkz, 0}, {X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0}, {X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0}, {X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0}, @@ -5696,10 +5830,16 @@ static const X86FoldTableEntry Table3[] = { {X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0}, {X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE}, {X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE}, + {X86::VRCPPBF16Z128rk, X86::VRCPPBF16Z128mk, 0}, + {X86::VRCPPBF16Z256rk, X86::VRCPPBF16Z256mk, 0}, + {X86::VRCPPBF16Zrk, X86::VRCPPBF16Zmk, 0}, {X86::VRCPPHZ128rk, X86::VRCPPHZ128mk, 0}, {X86::VRCPPHZ256rk, X86::VRCPPHZ256mk, 0}, {X86::VRCPPHZrk, X86::VRCPPHZmk, 0}, {X86::VRCPSHZrrkz, X86::VRCPSHZrmkz, TB_NO_REVERSE}, + {X86::VREDUCENEPBF16Z128rrik, X86::VREDUCENEPBF16Z128rmik, 0}, + {X86::VREDUCENEPBF16Z256rrik, X86::VREDUCENEPBF16Z256rmik, 0}, + {X86::VREDUCENEPBF16Zrrik, X86::VREDUCENEPBF16Zrmik, 0}, {X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0}, {X86::VREDUCEPDZ256rrik, 
X86::VREDUCEPDZ256rmik, 0}, {X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0}, @@ -5712,6 +5852,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE}, {X86::VREDUCESHZrrikz, X86::VREDUCESHZrmikz, TB_NO_REVERSE}, {X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE}, + {X86::VRNDSCALENEPBF16Z128rrik, X86::VRNDSCALENEPBF16Z128rmik, 0}, + {X86::VRNDSCALENEPBF16Z256rrik, X86::VRNDSCALENEPBF16Z256rmik, 0}, + {X86::VRNDSCALENEPBF16Zrrik, X86::VRNDSCALENEPBF16Zrmik, 0}, {X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0}, {X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0}, {X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0}, @@ -5736,10 +5879,16 @@ static const X86FoldTableEntry Table3[] = { {X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0}, {X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE}, {X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE}, + {X86::VRSQRTPBF16Z128rk, X86::VRSQRTPBF16Z128mk, 0}, + {X86::VRSQRTPBF16Z256rk, X86::VRSQRTPBF16Z256mk, 0}, + {X86::VRSQRTPBF16Zrk, X86::VRSQRTPBF16Zmk, 0}, {X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mk, 0}, {X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mk, 0}, {X86::VRSQRTPHZrk, X86::VRSQRTPHZmk, 0}, {X86::VRSQRTSHZrrkz, X86::VRSQRTSHZrmkz, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rrkz, X86::VSCALEFPBF16Z128rmkz, 0}, + {X86::VSCALEFPBF16Z256rrkz, X86::VSCALEFPBF16Z256rmkz, 0}, + {X86::VSCALEFPBF16Zrrkz, X86::VSCALEFPBF16Zrmkz, 0}, {X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0}, {X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0}, {X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0}, @@ -5769,6 +5918,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSM3MSG1rr, X86::VSM3MSG1rm, 0}, {X86::VSM3MSG2rr, X86::VSM3MSG2rm, 0}, {X86::VSM3RNDS2rr, X86::VSM3RNDS2rm, 0}, + {X86::VSQRTNEPBF16Z128rk, X86::VSQRTNEPBF16Z128mk, 0}, + {X86::VSQRTNEPBF16Z256rk, X86::VSQRTNEPBF16Z256mk, 0}, + {X86::VSQRTNEPBF16Zrk, X86::VSQRTNEPBF16Zmk, 0}, {X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0}, {X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0}, {X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0}, @@ -5781,6 +5933,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE}, {X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE}, {X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmkz, 0}, + {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmkz, 0}, + {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmkz, 0}, {X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0}, {X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0}, {X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0}, @@ -5814,6 +5969,9 @@ static const X86FoldTableEntry Table3[] = { }; static const X86FoldTableEntry Table4[] = { + {X86::VADDNEPBF16Z128rrk, X86::VADDNEPBF16Z128rmk, 0}, + {X86::VADDNEPBF16Z256rrk, X86::VADDNEPBF16Z256rmk, 0}, + {X86::VADDNEPBF16Zrrk, X86::VADDNEPBF16Zrmk, 0}, {X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0}, {X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0}, {X86::VADDPDZrrk, X86::VADDPDZrmk, 0}, @@ -5883,6 +6041,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0}, {X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0}, {X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0}, + {X86::VDIVNEPBF16Z128rrk, X86::VDIVNEPBF16Z128rmk, 0}, + {X86::VDIVNEPBF16Z256rrk, X86::VDIVNEPBF16Z256rmk, 0}, + {X86::VDIVNEPBF16Zrrk, X86::VDIVNEPBF16Zrmk, 0}, {X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0}, {X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0}, {X86::VDIVPDZrrk, X86::VDIVPDZrmk, 
0}, @@ -5935,6 +6096,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFIXUPIMMSDZrrikz, X86::VFIXUPIMMSDZrmikz, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrrik, X86::VFIXUPIMMSSZrmik, TB_NO_REVERSE}, {X86::VFIXUPIMMSSZrrikz, X86::VFIXUPIMMSSZrmikz, TB_NO_REVERSE}, + {X86::VFMADD132NEPBF16Z128rk, X86::VFMADD132NEPBF16Z128mk, 0}, + {X86::VFMADD132NEPBF16Z128rkz, X86::VFMADD132NEPBF16Z128mkz, 0}, + {X86::VFMADD132NEPBF16Z256rk, X86::VFMADD132NEPBF16Z256mk, 0}, + {X86::VFMADD132NEPBF16Z256rkz, X86::VFMADD132NEPBF16Z256mkz, 0}, + {X86::VFMADD132NEPBF16Zrk, X86::VFMADD132NEPBF16Zmk, 0}, + {X86::VFMADD132NEPBF16Zrkz, X86::VFMADD132NEPBF16Zmkz, 0}, {X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mk, 0}, {X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mkz, 0}, {X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mk, 0}, @@ -5959,6 +6126,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE}, {X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mk, 0}, + {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mkz, 0}, + {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mk, 0}, + {X86::VFMADD213NEPBF16Z256rkz, X86::VFMADD213NEPBF16Z256mkz, 0}, + {X86::VFMADD213NEPBF16Zrk, X86::VFMADD213NEPBF16Zmk, 0}, + {X86::VFMADD213NEPBF16Zrkz, X86::VFMADD213NEPBF16Zmkz, 0}, {X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0}, {X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mkz, 0}, {X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mk, 0}, @@ -5983,6 +6156,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE}, {X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMADD231NEPBF16Z128rk, X86::VFMADD231NEPBF16Z128mk, 0}, + {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mkz, 0}, + {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mk, 0}, + {X86::VFMADD231NEPBF16Z256rkz, X86::VFMADD231NEPBF16Z256mkz, 0}, + {X86::VFMADD231NEPBF16Zrk, X86::VFMADD231NEPBF16Zmk, 0}, + {X86::VFMADD231NEPBF16Zrkz, X86::VFMADD231NEPBF16Zmkz, 0}, {X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0}, {X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mkz, 0}, {X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mk, 0}, @@ -6069,6 +6248,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mkz, 0}, {X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmk, 0}, {X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmkz, 0}, + {X86::VFMSUB132NEPBF16Z128rk, X86::VFMSUB132NEPBF16Z128mk, 0}, + {X86::VFMSUB132NEPBF16Z128rkz, X86::VFMSUB132NEPBF16Z128mkz, 0}, + {X86::VFMSUB132NEPBF16Z256rk, X86::VFMSUB132NEPBF16Z256mk, 0}, + {X86::VFMSUB132NEPBF16Z256rkz, X86::VFMSUB132NEPBF16Z256mkz, 0}, + {X86::VFMSUB132NEPBF16Zrk, X86::VFMSUB132NEPBF16Zmk, 0}, + {X86::VFMSUB132NEPBF16Zrkz, X86::VFMSUB132NEPBF16Zmkz, 0}, {X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mk, 0}, {X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mkz, 0}, {X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mk, 0}, @@ -6093,6 +6278,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE}, {X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB213NEPBF16Z128rk, 
X86::VFMSUB213NEPBF16Z128mk, 0}, + {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mkz, 0}, + {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mk, 0}, + {X86::VFMSUB213NEPBF16Z256rkz, X86::VFMSUB213NEPBF16Z256mkz, 0}, + {X86::VFMSUB213NEPBF16Zrk, X86::VFMSUB213NEPBF16Zmk, 0}, + {X86::VFMSUB213NEPBF16Zrkz, X86::VFMSUB213NEPBF16Zmkz, 0}, {X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0}, {X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mkz, 0}, {X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mk, 0}, @@ -6117,6 +6308,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE}, {X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mk, 0}, + {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mkz, 0}, + {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mk, 0}, + {X86::VFMSUB231NEPBF16Z256rkz, X86::VFMSUB231NEPBF16Z256mkz, 0}, + {X86::VFMSUB231NEPBF16Zrk, X86::VFMSUB231NEPBF16Zmk, 0}, + {X86::VFMSUB231NEPBF16Zrkz, X86::VFMSUB231NEPBF16Zmkz, 0}, {X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0}, {X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mkz, 0}, {X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mk, 0}, @@ -6199,6 +6396,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmk, 0}, {X86::VFMULCPHZrrk, X86::VFMULCPHZrmk, 0}, {X86::VFMULCSHZrrk, X86::VFMULCSHZrmk, TB_NO_REVERSE}, + {X86::VFNMADD132NEPBF16Z128rk, X86::VFNMADD132NEPBF16Z128mk, 0}, + {X86::VFNMADD132NEPBF16Z128rkz, X86::VFNMADD132NEPBF16Z128mkz, 0}, + {X86::VFNMADD132NEPBF16Z256rk, X86::VFNMADD132NEPBF16Z256mk, 0}, + {X86::VFNMADD132NEPBF16Z256rkz, X86::VFNMADD132NEPBF16Z256mkz, 0}, + {X86::VFNMADD132NEPBF16Zrk, X86::VFNMADD132NEPBF16Zmk, 0}, + {X86::VFNMADD132NEPBF16Zrkz, X86::VFNMADD132NEPBF16Zmkz, 0}, {X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0}, {X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0}, {X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0}, @@ -6223,6 +6426,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mk, 0}, + {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mkz, 0}, + {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mk, 0}, + {X86::VFNMADD213NEPBF16Z256rkz, X86::VFNMADD213NEPBF16Z256mkz, 0}, + {X86::VFNMADD213NEPBF16Zrk, X86::VFNMADD213NEPBF16Zmk, 0}, + {X86::VFNMADD213NEPBF16Zrkz, X86::VFNMADD213NEPBF16Zmkz, 0}, {X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0}, {X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mkz, 0}, {X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mk, 0}, @@ -6247,6 +6456,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mk, 0}, + {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mkz, 0}, + {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mk, 0}, + {X86::VFNMADD231NEPBF16Z256rkz, X86::VFNMADD231NEPBF16Z256mkz, 0}, + {X86::VFNMADD231NEPBF16Zrk, X86::VFNMADD231NEPBF16Zmk, 
0}, + {X86::VFNMADD231NEPBF16Zrkz, X86::VFNMADD231NEPBF16Zmkz, 0}, {X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0}, {X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mkz, 0}, {X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mk, 0}, @@ -6271,6 +6486,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mk, 0}, + {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mkz, 0}, + {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mk, 0}, + {X86::VFNMSUB132NEPBF16Z256rkz, X86::VFNMSUB132NEPBF16Z256mkz, 0}, + {X86::VFNMSUB132NEPBF16Zrk, X86::VFNMSUB132NEPBF16Zmk, 0}, + {X86::VFNMSUB132NEPBF16Zrkz, X86::VFNMSUB132NEPBF16Zmkz, 0}, {X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0}, {X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mkz, 0}, {X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mk, 0}, @@ -6295,6 +6516,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mk, 0}, + {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mkz, 0}, + {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mk, 0}, + {X86::VFNMSUB213NEPBF16Z256rkz, X86::VFNMSUB213NEPBF16Z256mkz, 0}, + {X86::VFNMSUB213NEPBF16Zrk, X86::VFNMSUB213NEPBF16Zmk, 0}, + {X86::VFNMSUB213NEPBF16Zrkz, X86::VFNMSUB213NEPBF16Zmkz, 0}, {X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0}, {X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mkz, 0}, {X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mk, 0}, @@ -6319,6 +6546,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE}, {X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE}, {X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE}, + {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mk, 0}, + {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mkz, 0}, + {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mk, 0}, + {X86::VFNMSUB231NEPBF16Z256rkz, X86::VFNMSUB231NEPBF16Z256mkz, 0}, + {X86::VFNMSUB231NEPBF16Zrk, X86::VFNMSUB231NEPBF16Zmk, 0}, + {X86::VFNMSUB231NEPBF16Zrkz, X86::VFNMSUB231NEPBF16Zmkz, 0}, {X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0}, {X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mkz, 0}, {X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mk, 0}, @@ -6379,6 +6612,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0}, + {X86::VMAXPBF16Z128rrk, X86::VMAXPBF16Z128rmk, 0}, + {X86::VMAXPBF16Z256rrk, X86::VMAXPBF16Z256rmk, 0}, + {X86::VMAXPBF16Zrrk, X86::VMAXPBF16Zrmk, 0}, {X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0}, {X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0}, {X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0}, @@ -6415,6 +6651,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMINMAXSDrrik, X86::VMINMAXSDrmik, TB_NO_REVERSE}, {X86::VMINMAXSHrrik, X86::VMINMAXSHrmik, TB_NO_REVERSE}, {X86::VMINMAXSSrrik, X86::VMINMAXSSrmik, TB_NO_REVERSE}, + {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0}, + 
{X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0}, + {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0}, {X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0}, {X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0}, {X86::VMINPDZrrk, X86::VMINPDZrmk, 0}, @@ -6430,6 +6669,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VMPSADBWZ128rrik, X86::VMPSADBWZ128rmik, 0}, {X86::VMPSADBWZ256rrik, X86::VMPSADBWZ256rmik, 0}, {X86::VMPSADBWZrrik, X86::VMPSADBWZrmik, 0}, + {X86::VMULNEPBF16Z128rrk, X86::VMULNEPBF16Z128rmk, 0}, + {X86::VMULNEPBF16Z256rrk, X86::VMULNEPBF16Z256rmk, 0}, + {X86::VMULNEPBF16Zrrk, X86::VMULNEPBF16Zrmk, 0}, {X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0}, {X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0}, {X86::VMULPDZrrk, X86::VMULPDZrmk, 0}, @@ -7005,6 +7247,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE}, {X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE}, {X86::VRSQRTSHZrrk, X86::VRSQRTSHZrmk, TB_NO_REVERSE}, + {X86::VSCALEFPBF16Z128rrk, X86::VSCALEFPBF16Z128rmk, 0}, + {X86::VSCALEFPBF16Z256rrk, X86::VSCALEFPBF16Z256rmk, 0}, + {X86::VSCALEFPBF16Zrrk, X86::VSCALEFPBF16Zrmk, 0}, {X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0}, {X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0}, {X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0}, @@ -7034,6 +7279,9 @@ static const X86FoldTableEntry Table4[] = { {X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE}, {X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE}, {X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE}, + {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmk, 0}, + {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmk, 0}, + {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmk, 0}, {X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0}, {X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0}, {X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0}, @@ -7264,6 +7512,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_W}, {X86::VEXP2PDZr, X86::VEXP2PDZmb, TB_BCAST_SD}, {X86::VEXP2PSZr, X86::VEXP2PSZmb, TB_BCAST_SS}, + {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rmb, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rmb, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrmb, TB_BCAST_SH}, {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rmb, TB_BCAST_SD}, {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rmb, TB_BCAST_SD}, {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrmb, TB_BCAST_SD}, @@ -7273,6 +7524,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rmb, TB_BCAST_SS}, {X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rmb, TB_BCAST_SS}, {X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrmb, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128r, X86::VGETEXPPBF16Z128mb, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256r, X86::VGETEXPPBF16Z256mb, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zr, X86::VGETEXPPBF16Zmb, TB_BCAST_SH}, {X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128mb, TB_BCAST_SD}, {X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256mb, TB_BCAST_SD}, {X86::VGETEXPPDZr, X86::VGETEXPPDZmb, TB_BCAST_SD}, @@ -7282,6 +7536,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128mb, TB_BCAST_SS}, {X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256mb, TB_BCAST_SS}, {X86::VGETEXPPSZr, X86::VGETEXPPSZmb, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rri, X86::VGETMANTPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rri, X86::VGETMANTPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrri, X86::VGETMANTPBF16Zrmbi, TB_BCAST_SH}, {X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmbi, 
TB_BCAST_SD}, {X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmbi, TB_BCAST_SD}, {X86::VGETMANTPDZrri, X86::VGETMANTPDZrmbi, TB_BCAST_SD}, @@ -7366,9 +7623,15 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VRCP14PSZr, X86::VRCP14PSZmb, TB_BCAST_SS}, {X86::VRCP28PDZr, X86::VRCP28PDZmb, TB_BCAST_SD}, {X86::VRCP28PSZr, X86::VRCP28PSZmb, TB_BCAST_SS}, + {X86::VRCPPBF16Z128r, X86::VRCPPBF16Z128mb, TB_BCAST_SH}, + {X86::VRCPPBF16Z256r, X86::VRCPPBF16Z256mb, TB_BCAST_SH}, + {X86::VRCPPBF16Zr, X86::VRCPPBF16Zmb, TB_BCAST_SH}, {X86::VRCPPHZ128r, X86::VRCPPHZ128mb, TB_BCAST_SH}, {X86::VRCPPHZ256r, X86::VRCPPHZ256mb, TB_BCAST_SH}, {X86::VRCPPHZr, X86::VRCPPHZmb, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rri, X86::VREDUCENEPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rri, X86::VREDUCENEPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrri, X86::VREDUCENEPBF16Zrmbi, TB_BCAST_SH}, {X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmbi, TB_BCAST_SD}, {X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmbi, TB_BCAST_SD}, {X86::VREDUCEPDZrri, X86::VREDUCEPDZrmbi, TB_BCAST_SD}, @@ -7378,6 +7641,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmbi, TB_BCAST_SS}, {X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmbi, TB_BCAST_SS}, {X86::VREDUCEPSZrri, X86::VREDUCEPSZrmbi, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rri, X86::VRNDSCALENEPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rri, X86::VRNDSCALENEPBF16Z256rmbi, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrri, X86::VRNDSCALENEPBF16Zrmbi, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmbi, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmbi, TB_BCAST_SD}, {X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmbi, TB_BCAST_SD}, @@ -7395,9 +7661,15 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VRSQRT14PSZr, X86::VRSQRT14PSZmb, TB_BCAST_SS}, {X86::VRSQRT28PDZr, X86::VRSQRT28PDZmb, TB_BCAST_SD}, {X86::VRSQRT28PSZr, X86::VRSQRT28PSZmb, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128r, X86::VRSQRTPBF16Z128mb, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256r, X86::VRSQRTPBF16Z256mb, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zr, X86::VRSQRTPBF16Zmb, TB_BCAST_SH}, {X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128mb, TB_BCAST_SH}, {X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256mb, TB_BCAST_SH}, {X86::VRSQRTPHZr, X86::VRSQRTPHZmb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z128r, X86::VSQRTNEPBF16Z128mb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256r, X86::VSQRTNEPBF16Z256mb, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zr, X86::VSQRTNEPBF16Zmb, TB_BCAST_SH}, {X86::VSQRTPDZ128r, X86::VSQRTPDZ128mb, TB_BCAST_SD}, {X86::VSQRTPDZ256r, X86::VSQRTPDZ256mb, TB_BCAST_SD}, {X86::VSQRTPDZr, X86::VSQRTPDZmb, TB_BCAST_SD}, @@ -7410,6 +7682,9 @@ static const X86FoldTableEntry BroadcastTable1[] = { }; static const X86FoldTableEntry BroadcastTable2[] = { + {X86::VADDNEPBF16Z128rr, X86::VADDNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rr, X86::VADDNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrr, X86::VADDNEPBF16Zrmb, TB_BCAST_SH}, {X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD}, {X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD}, {X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD}, @@ -7443,6 +7718,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VBLENDMPSZ128rr, X86::VBLENDMPSZ128rmb, TB_BCAST_SS}, {X86::VBLENDMPSZ256rr, X86::VBLENDMPSZ256rmb, TB_BCAST_SS}, {X86::VBLENDMPSZrr, X86::VBLENDMPSZrmb, TB_BCAST_SS}, + {X86::VCMPPBF16Z128rri, X86::VCMPPBF16Z128rmbi, TB_BCAST_SH}, + {X86::VCMPPBF16Z256rri, X86::VCMPPBF16Z256rmbi, TB_BCAST_SH}, + 
{X86::VCMPPBF16Zrri, X86::VCMPPBF16Zrmbi, TB_BCAST_SH}, {X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD}, {X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD}, {X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD}, @@ -7677,6 +7955,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTW2PHZ128rrkz, X86::VCVTW2PHZ128rmbkz, TB_BCAST_W}, {X86::VCVTW2PHZ256rrkz, X86::VCVTW2PHZ256rmbkz, TB_BCAST_W}, {X86::VCVTW2PHZrrkz, X86::VCVTW2PHZrmbkz, TB_BCAST_W}, + {X86::VDIVNEPBF16Z128rr, X86::VDIVNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rr, X86::VDIVNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrr, X86::VDIVNEPBF16Zrmb, TB_BCAST_SH}, {X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD}, {X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD}, {X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD}, @@ -7694,6 +7975,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rmb, TB_BCAST_SS}, {X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rmb, TB_BCAST_SS}, {X86::VFMULCPHZrr, X86::VFMULCPHZrmb, TB_BCAST_SS}, + {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmbk, TB_BCAST_SH}, {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmbk, TB_BCAST_SD}, {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmbk, TB_BCAST_SD}, {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmbk, TB_BCAST_SD}, @@ -7703,6 +7987,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmbk, TB_BCAST_SS}, {X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmbk, TB_BCAST_SS}, {X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmbk, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmbkz, TB_BCAST_SH}, {X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mbkz, TB_BCAST_SD}, {X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mbkz, TB_BCAST_SD}, {X86::VGETEXPPDZrkz, X86::VGETEXPPDZmbkz, TB_BCAST_SD}, @@ -7712,6 +7999,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mbkz, TB_BCAST_SS}, {X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mbkz, TB_BCAST_SS}, {X86::VGETEXPPSZrkz, X86::VGETEXPPSZmbkz, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rrikz, X86::VGETMANTPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rrikz, X86::VGETMANTPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrrikz, X86::VGETMANTPBF16Zrmbikz, TB_BCAST_SH}, {X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmbikz, TB_BCAST_SD}, {X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmbikz, TB_BCAST_SD}, {X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmbikz, TB_BCAST_SD}, @@ -7736,6 +8026,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS}, {X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS}, {X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rr, X86::VMAXPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rr, X86::VMAXPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMAXPBF16Zrr, X86::VMAXPBF16Zrmb, TB_BCAST_SH}, {X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD}, {X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD}, {X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD}, @@ -7766,6 +8059,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINMAXPSZ128rri, X86::VMINMAXPSZ128rmbi, TB_BCAST_SS}, {X86::VMINMAXPSZ256rri, X86::VMINMAXPSZ256rmbi, TB_BCAST_SS}, 
{X86::VMINMAXPSZrri, X86::VMINMAXPSZrmbi, TB_BCAST_SS}, + {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMINPBF16Zrr, X86::VMINPBF16Zrmb, TB_BCAST_SH}, {X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD}, {X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD}, {X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD}, @@ -7775,6 +8071,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS}, {X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS}, {X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rr, X86::VMULNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rr, X86::VMULNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrr, X86::VMULNEPBF16Zrmb, TB_BCAST_SH}, {X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD}, {X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD}, {X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD}, @@ -8068,9 +8367,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VRCP14PSZrkz, X86::VRCP14PSZmbkz, TB_BCAST_SS}, {X86::VRCP28PDZrkz, X86::VRCP28PDZmbkz, TB_BCAST_SD}, {X86::VRCP28PSZrkz, X86::VRCP28PSZmbkz, TB_BCAST_SS}, + {X86::VRCPPBF16Z128rkz, X86::VRCPPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VRCPPBF16Z256rkz, X86::VRCPPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VRCPPBF16Zrkz, X86::VRCPPBF16Zmbkz, TB_BCAST_SH}, {X86::VRCPPHZ128rkz, X86::VRCPPHZ128mbkz, TB_BCAST_SH}, {X86::VRCPPHZ256rkz, X86::VRCPPHZ256mbkz, TB_BCAST_SH}, {X86::VRCPPHZrkz, X86::VRCPPHZmbkz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rrikz, X86::VREDUCENEPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rrikz, X86::VREDUCENEPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrrikz, X86::VREDUCENEPBF16Zrmbikz, TB_BCAST_SH}, {X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmbikz, TB_BCAST_SD}, {X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmbikz, TB_BCAST_SD}, {X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmbikz, TB_BCAST_SD}, @@ -8080,6 +8385,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmbikz, TB_BCAST_SS}, {X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmbikz, TB_BCAST_SS}, {X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmbikz, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rrikz, X86::VRNDSCALENEPBF16Z128rmbikz, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rrikz, X86::VRNDSCALENEPBF16Z256rmbikz, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrrikz, X86::VRNDSCALENEPBF16Zrmbikz, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmbikz, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmbikz, TB_BCAST_SD}, {X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmbikz, TB_BCAST_SD}, @@ -8097,9 +8405,15 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VRSQRT14PSZrkz, X86::VRSQRT14PSZmbkz, TB_BCAST_SS}, {X86::VRSQRT28PDZrkz, X86::VRSQRT28PDZmbkz, TB_BCAST_SD}, {X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmbkz, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128rkz, X86::VRSQRTPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256rkz, X86::VRSQRTPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zrkz, X86::VRSQRTPBF16Zmbkz, TB_BCAST_SH}, {X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mbkz, TB_BCAST_SH}, {X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mbkz, TB_BCAST_SH}, {X86::VRSQRTPHZrkz, X86::VRSQRTPHZmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z128rr, X86::VSCALEFPBF16Z128rmb, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rr, X86::VSCALEFPBF16Z256rmb, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrr, X86::VSCALEFPBF16Zrmb, TB_BCAST_SH}, {X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rmb, 
TB_BCAST_SD}, {X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rmb, TB_BCAST_SD}, {X86::VSCALEFPDZrr, X86::VSCALEFPDZrmb, TB_BCAST_SD}, @@ -8123,6 +8437,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmbi, TB_BCAST_SS}, {X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmbi, TB_BCAST_SS}, {X86::VSHUFPSZrri, X86::VSHUFPSZrmbi, TB_BCAST_SS}, + {X86::VSQRTNEPBF16Z128rkz, X86::VSQRTNEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256rkz, X86::VSQRTNEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zrkz, X86::VSQRTNEPBF16Zmbkz, TB_BCAST_SH}, {X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mbkz, TB_BCAST_SD}, {X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mbkz, TB_BCAST_SD}, {X86::VSQRTPDZrkz, X86::VSQRTPDZmbkz, TB_BCAST_SD}, @@ -8132,6 +8449,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mbkz, TB_BCAST_SS}, {X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mbkz, TB_BCAST_SS}, {X86::VSQRTPSZrkz, X86::VSQRTPSZmbkz, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rr, X86::VSUBNEPBF16Z128rmb, TB_BCAST_SH}, + {X86::VSUBNEPBF16Z256rr, X86::VSUBNEPBF16Z256rmb, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrr, X86::VSUBNEPBF16Zrmb, TB_BCAST_SH}, {X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD}, {X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD}, {X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD}, @@ -8162,6 +8482,9 @@ static const X86FoldTableEntry BroadcastTable2[] = { }; static const X86FoldTableEntry BroadcastTable3[] = { + {X86::VADDNEPBF16Z128rrkz, X86::VADDNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rrkz, X86::VADDNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrrkz, X86::VADDNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VADDPDZ128rrkz, X86::VADDPDZ128rmbkz, TB_BCAST_SD}, {X86::VADDPDZ256rrkz, X86::VADDPDZ256rmbkz, TB_BCAST_SD}, {X86::VADDPDZrrkz, X86::VADDPDZrmbkz, TB_BCAST_SD}, @@ -8195,6 +8518,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmbk, TB_BCAST_SS}, + {X86::VCMPPBF16Z128rrik, X86::VCMPPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VCMPPBF16Z256rrik, X86::VCMPPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VCMPPBF16Zrrik, X86::VCMPPBF16Zrmbik, TB_BCAST_SH}, {X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmbik, TB_BCAST_SD}, {X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmbik, TB_BCAST_SD}, {X86::VCMPPDZrrik, X86::VCMPPDZrmbik, TB_BCAST_SD}, @@ -8429,6 +8755,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTW2PHZ128rrk, X86::VCVTW2PHZ128rmbk, TB_BCAST_W}, {X86::VCVTW2PHZ256rrk, X86::VCVTW2PHZ256rmbk, TB_BCAST_W}, {X86::VCVTW2PHZrrk, X86::VCVTW2PHZrmbk, TB_BCAST_W}, + {X86::VDIVNEPBF16Z128rrkz, X86::VDIVNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rrkz, X86::VDIVNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrrkz, X86::VDIVNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmbkz, TB_BCAST_SD}, {X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmbkz, TB_BCAST_SD}, {X86::VDIVPDZrrkz, X86::VDIVPDZrmbkz, TB_BCAST_SD}, @@ -8458,6 +8787,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFIXUPIMMPSZ128rri, X86::VFIXUPIMMPSZ128rmbi, TB_BCAST_SS}, {X86::VFIXUPIMMPSZ256rri, X86::VFIXUPIMMPSZ256rmbi, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrri, X86::VFIXUPIMMPSZrmbi, TB_BCAST_SS}, + {X86::VFMADD132NEPBF16Z128r, X86::VFMADD132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256r, X86::VFMADD132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zr, X86::VFMADD132NEPBF16Zmb, TB_BCAST_SH}, 
{X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD}, {X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD}, {X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD}, @@ -8467,6 +8799,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS}, {X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS}, {X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS}, + {X86::VFMADD213NEPBF16Z128r, X86::VFMADD213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256r, X86::VFMADD213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zr, X86::VFMADD213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD}, {X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD}, {X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD}, @@ -8476,6 +8811,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS}, {X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS}, {X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS}, + {X86::VFMADD231NEPBF16Z128r, X86::VFMADD231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256r, X86::VFMADD231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zr, X86::VFMADD231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD}, {X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD}, {X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD}, @@ -8515,6 +8853,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS}, {X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS}, {X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS}, + {X86::VFMSUB132NEPBF16Z128r, X86::VFMSUB132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256r, X86::VFMSUB132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zr, X86::VFMSUB132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD}, @@ -8524,6 +8865,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS}, {X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS}, {X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS}, + {X86::VFMSUB213NEPBF16Z128r, X86::VFMSUB213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256r, X86::VFMSUB213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zr, X86::VFMSUB213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD}, @@ -8533,6 +8877,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS}, {X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS}, {X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS}, + {X86::VFMSUB231NEPBF16Z128r, X86::VFMSUB231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256r, X86::VFMSUB231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zr, X86::VFMSUB231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD}, {X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD}, {X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD}, @@ -8572,6 +8919,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFMULCPHZ128rrkz, 
X86::VFMULCPHZ128rmbkz, TB_BCAST_SS}, {X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmbkz, TB_BCAST_SS}, {X86::VFMULCPHZrrkz, X86::VFMULCPHZrmbkz, TB_BCAST_SS}, + {X86::VFNMADD132NEPBF16Z128r, X86::VFNMADD132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256r, X86::VFNMADD132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zr, X86::VFNMADD132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD}, @@ -8581,6 +8931,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS}, + {X86::VFNMADD213NEPBF16Z128r, X86::VFNMADD213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256r, X86::VFNMADD213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zr, X86::VFNMADD213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD}, @@ -8590,6 +8943,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS}, + {X86::VFNMADD231NEPBF16Z128r, X86::VFNMADD231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256r, X86::VFNMADD231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zr, X86::VFNMADD231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD}, {X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD}, {X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD}, @@ -8599,6 +8955,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS}, {X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS}, {X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB132NEPBF16Z128r, X86::VFNMSUB132NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256r, X86::VFNMSUB132NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zr, X86::VFNMSUB132NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD}, @@ -8608,6 +8967,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB213NEPBF16Z128r, X86::VFNMSUB213NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256r, X86::VFNMSUB213NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zr, X86::VFNMSUB213NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD}, @@ -8617,6 +8979,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS}, + {X86::VFNMSUB231NEPBF16Z128r, 
X86::VFNMSUB231NEPBF16Z128mb, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256r, X86::VFNMSUB231NEPBF16Z256mb, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zr, X86::VFNMSUB231NEPBF16Zmb, TB_BCAST_SH}, {X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD}, {X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD}, {X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD}, @@ -8626,6 +8991,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS}, {X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS}, {X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS}, + {X86::VGETEXPPBF16Z128rk, X86::VGETEXPPBF16Z128mbk, TB_BCAST_SH}, + {X86::VGETEXPPBF16Z256rk, X86::VGETEXPPBF16Z256mbk, TB_BCAST_SH}, + {X86::VGETEXPPBF16Zrk, X86::VGETEXPPBF16Zmbk, TB_BCAST_SH}, {X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mbk, TB_BCAST_SD}, {X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mbk, TB_BCAST_SD}, {X86::VGETEXPPDZrk, X86::VGETEXPPDZmbk, TB_BCAST_SD}, @@ -8635,6 +9003,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mbk, TB_BCAST_SS}, {X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mbk, TB_BCAST_SS}, {X86::VGETEXPPSZrk, X86::VGETEXPPSZmbk, TB_BCAST_SS}, + {X86::VGETMANTPBF16Z128rrik, X86::VGETMANTPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VGETMANTPBF16Z256rrik, X86::VGETMANTPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VGETMANTPBF16Zrrik, X86::VGETMANTPBF16Zrmbik, TB_BCAST_SH}, {X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmbik, TB_BCAST_SD}, {X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmbik, TB_BCAST_SD}, {X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmbik, TB_BCAST_SD}, @@ -8659,6 +9030,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmbkz, TB_BCAST_SS}, {X86::VMAXCPSZrrkz, X86::VMAXCPSZrmbkz, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rrkz, X86::VMAXPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rrkz, X86::VMAXPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMAXPBF16Zrrkz, X86::VMAXPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmbkz, TB_BCAST_SD}, {X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmbkz, TB_BCAST_SD}, {X86::VMAXPDZrrkz, X86::VMAXPDZrmbkz, TB_BCAST_SD}, @@ -8689,6 +9063,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmbikz, TB_BCAST_SS}, {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmbikz, TB_BCAST_SS}, {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmbikz, TB_BCAST_SS}, + {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMINPDZ128rrkz, X86::VMINPDZ128rmbkz, TB_BCAST_SD}, {X86::VMINPDZ256rrkz, X86::VMINPDZ256rmbkz, TB_BCAST_SD}, {X86::VMINPDZrrkz, X86::VMINPDZrmbkz, TB_BCAST_SD}, @@ -8698,6 +9075,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VMINPSZ128rrkz, X86::VMINPSZ128rmbkz, TB_BCAST_SS}, {X86::VMINPSZ256rrkz, X86::VMINPSZ256rmbkz, TB_BCAST_SS}, {X86::VMINPSZrrkz, X86::VMINPSZrmbkz, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rrkz, X86::VMULNEPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rrkz, X86::VMULNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrrkz, X86::VMULNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VMULPDZ128rrkz, X86::VMULPDZ128rmbkz, TB_BCAST_SD}, {X86::VMULPDZ256rrkz, X86::VMULPDZ256rmbkz, TB_BCAST_SD}, {X86::VMULPDZrrkz, X86::VMULPDZrmbkz, TB_BCAST_SD}, @@ -9081,9 +9461,15 @@ static const 
X86FoldTableEntry BroadcastTable3[] = { {X86::VRCP14PSZrk, X86::VRCP14PSZmbk, TB_BCAST_SS}, {X86::VRCP28PDZrk, X86::VRCP28PDZmbk, TB_BCAST_SD}, {X86::VRCP28PSZrk, X86::VRCP28PSZmbk, TB_BCAST_SS}, + {X86::VRCPPBF16Z128rk, X86::VRCPPBF16Z128mbk, TB_BCAST_SH}, + {X86::VRCPPBF16Z256rk, X86::VRCPPBF16Z256mbk, TB_BCAST_SH}, + {X86::VRCPPBF16Zrk, X86::VRCPPBF16Zmbk, TB_BCAST_SH}, {X86::VRCPPHZ128rk, X86::VRCPPHZ128mbk, TB_BCAST_SH}, {X86::VRCPPHZ256rk, X86::VRCPPHZ256mbk, TB_BCAST_SH}, {X86::VRCPPHZrk, X86::VRCPPHZmbk, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z128rrik, X86::VREDUCENEPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Z256rrik, X86::VREDUCENEPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VREDUCENEPBF16Zrrik, X86::VREDUCENEPBF16Zrmbik, TB_BCAST_SH}, {X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmbik, TB_BCAST_SD}, {X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmbik, TB_BCAST_SD}, {X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmbik, TB_BCAST_SD}, @@ -9093,6 +9479,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmbik, TB_BCAST_SS}, {X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmbik, TB_BCAST_SS}, {X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmbik, TB_BCAST_SS}, + {X86::VRNDSCALENEPBF16Z128rrik, X86::VRNDSCALENEPBF16Z128rmbik, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Z256rrik, X86::VRNDSCALENEPBF16Z256rmbik, TB_BCAST_SH}, + {X86::VRNDSCALENEPBF16Zrrik, X86::VRNDSCALENEPBF16Zrmbik, TB_BCAST_SH}, {X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmbik, TB_BCAST_SD}, {X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmbik, TB_BCAST_SD}, {X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmbik, TB_BCAST_SD}, @@ -9110,9 +9499,15 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VRSQRT14PSZrk, X86::VRSQRT14PSZmbk, TB_BCAST_SS}, {X86::VRSQRT28PDZrk, X86::VRSQRT28PDZmbk, TB_BCAST_SD}, {X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmbk, TB_BCAST_SS}, + {X86::VRSQRTPBF16Z128rk, X86::VRSQRTPBF16Z128mbk, TB_BCAST_SH}, + {X86::VRSQRTPBF16Z256rk, X86::VRSQRTPBF16Z256mbk, TB_BCAST_SH}, + {X86::VRSQRTPBF16Zrk, X86::VRSQRTPBF16Zmbk, TB_BCAST_SH}, {X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mbk, TB_BCAST_SH}, {X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mbk, TB_BCAST_SH}, {X86::VRSQRTPHZrk, X86::VRSQRTPHZmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z128rrkz, X86::VSCALEFPBF16Z128rmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rrkz, X86::VSCALEFPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrrkz, X86::VSCALEFPBF16Zrmbkz, TB_BCAST_SH}, {X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmbkz, TB_BCAST_SD}, {X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmbkz, TB_BCAST_SD}, {X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmbkz, TB_BCAST_SD}, @@ -9136,6 +9531,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmbikz, TB_BCAST_SS}, {X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmbikz, TB_BCAST_SS}, {X86::VSHUFPSZrrikz, X86::VSHUFPSZrmbikz, TB_BCAST_SS}, + {X86::VSQRTNEPBF16Z128rk, X86::VSQRTNEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Z256rk, X86::VSQRTNEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VSQRTNEPBF16Zrk, X86::VSQRTNEPBF16Zmbk, TB_BCAST_SH}, {X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mbk, TB_BCAST_SD}, {X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mbk, TB_BCAST_SD}, {X86::VSQRTPDZrk, X86::VSQRTPDZmbk, TB_BCAST_SD}, @@ -9145,6 +9543,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mbk, TB_BCAST_SS}, {X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mbk, TB_BCAST_SS}, {X86::VSQRTPSZrk, X86::VSQRTPSZmbk, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmbkz, TB_BCAST_SH}, 
+ {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmbkz, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmbkz, TB_BCAST_SH}, {X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmbkz, TB_BCAST_SD}, {X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmbkz, TB_BCAST_SD}, {X86::VSUBPDZrrkz, X86::VSUBPDZrmbkz, TB_BCAST_SD}, @@ -9175,6 +9576,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { }; static const X86FoldTableEntry BroadcastTable4[] = { + {X86::VADDNEPBF16Z128rrk, X86::VADDNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VADDNEPBF16Z256rrk, X86::VADDNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VADDNEPBF16Zrrk, X86::VADDNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VADDPDZ128rrk, X86::VADDPDZ128rmbk, TB_BCAST_SD}, {X86::VADDPDZ256rrk, X86::VADDPDZ256rmbk, TB_BCAST_SD}, {X86::VADDPDZrrk, X86::VADDPDZrmbk, TB_BCAST_SD}, @@ -9232,6 +9636,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmbk, TB_BCAST_SS}, + {X86::VDIVNEPBF16Z128rrk, X86::VDIVNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VDIVNEPBF16Z256rrk, X86::VDIVNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VDIVNEPBF16Zrrk, X86::VDIVNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmbk, TB_BCAST_SD}, {X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmbk, TB_BCAST_SD}, {X86::VDIVPDZrrk, X86::VDIVPDZrmbk, TB_BCAST_SD}, @@ -9274,6 +9681,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFIXUPIMMPSZ256rrikz, X86::VFIXUPIMMPSZ256rmbikz, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrrik, X86::VFIXUPIMMPSZrmbik, TB_BCAST_SS}, {X86::VFIXUPIMMPSZrrikz, X86::VFIXUPIMMPSZrmbikz, TB_BCAST_SS}, + {X86::VFMADD132NEPBF16Z128rk, X86::VFMADD132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z128rkz, X86::VFMADD132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256rk, X86::VFMADD132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Z256rkz, X86::VFMADD132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zrk, X86::VFMADD132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD132NEPBF16Zrkz, X86::VFMADD132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD132PDZ128rk, X86::VFMADD132PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD132PDZ128rkz, X86::VFMADD132PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD132PDZ256rk, X86::VFMADD132PDZ256mbk, TB_BCAST_SD}, @@ -9292,6 +9705,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADD132PSZrk, X86::VFMADD132PSZmbk, TB_BCAST_SS}, {X86::VFMADD132PSZrkz, X86::VFMADD132PSZmbkz, TB_BCAST_SS}, + {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Z256rkz, X86::VFMADD213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zrk, X86::VFMADD213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD213NEPBF16Zrkz, X86::VFMADD213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD213PDZ128rkz, X86::VFMADD213PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD213PDZ256rk, X86::VFMADD213PDZ256mbk, TB_BCAST_SD}, @@ -9310,6 +9729,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADD213PSZrk, X86::VFMADD213PSZmbk, TB_BCAST_SS}, {X86::VFMADD213PSZrkz, X86::VFMADD213PSZmbkz, TB_BCAST_SS}, + {X86::VFMADD231NEPBF16Z128rk, 
X86::VFMADD231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Z256rkz, X86::VFMADD231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zrk, X86::VFMADD231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMADD231NEPBF16Zrkz, X86::VFMADD231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mbk, TB_BCAST_SD}, {X86::VFMADD231PDZ128rkz, X86::VFMADD231PDZ128mbkz, TB_BCAST_SD}, {X86::VFMADD231PDZ256rk, X86::VFMADD231PDZ256mbk, TB_BCAST_SD}, @@ -9388,6 +9813,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMADDSUB231PSZ256rkz, X86::VFMADDSUB231PSZ256mbkz, TB_BCAST_SS}, {X86::VFMADDSUB231PSZrk, X86::VFMADDSUB231PSZmbk, TB_BCAST_SS}, {X86::VFMADDSUB231PSZrkz, X86::VFMADDSUB231PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB132NEPBF16Z128rk, X86::VFMSUB132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z128rkz, X86::VFMSUB132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256rk, X86::VFMSUB132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Z256rkz, X86::VFMSUB132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zrk, X86::VFMSUB132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB132NEPBF16Zrkz, X86::VFMSUB132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB132PDZ128rk, X86::VFMSUB132PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB132PDZ128rkz, X86::VFMSUB132PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB132PDZ256rk, X86::VFMSUB132PDZ256mbk, TB_BCAST_SD}, @@ -9406,6 +9837,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mbkz, TB_BCAST_SS}, {X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmbk, TB_BCAST_SS}, {X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB213NEPBF16Z128rk, X86::VFMSUB213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Z256rkz, X86::VFMSUB213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zrk, X86::VFMSUB213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB213NEPBF16Zrkz, X86::VFMSUB213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB213PDZ128rkz, X86::VFMSUB213PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB213PDZ256rk, X86::VFMSUB213PDZ256mbk, TB_BCAST_SD}, @@ -9424,6 +9861,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mbkz, TB_BCAST_SS}, {X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmbk, TB_BCAST_SS}, {X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmbkz, TB_BCAST_SS}, + {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Z256rkz, X86::VFMSUB231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zrk, X86::VFMSUB231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFMSUB231NEPBF16Zrkz, X86::VFMSUB231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mbk, TB_BCAST_SD}, {X86::VFMSUB231PDZ128rkz, X86::VFMSUB231PDZ128mbkz, TB_BCAST_SD}, {X86::VFMSUB231PDZ256rk, X86::VFMSUB231PDZ256mbk, TB_BCAST_SD}, @@ -9499,6 +9942,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFMULCPHZ128rrk, X86::VFMULCPHZ128rmbk, TB_BCAST_SS}, {X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmbk, TB_BCAST_SS}, 
{X86::VFMULCPHZrrk, X86::VFMULCPHZrmbk, TB_BCAST_SS}, + {X86::VFNMADD132NEPBF16Z128rk, X86::VFNMADD132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z128rkz, X86::VFNMADD132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256rk, X86::VFNMADD132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Z256rkz, X86::VFNMADD132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zrk, X86::VFNMADD132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD132NEPBF16Zrkz, X86::VFNMADD132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mbk, TB_BCAST_SD}, @@ -9517,6 +9966,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmbk, TB_BCAST_SS}, {X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmbkz, TB_BCAST_SS}, + {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Z256rkz, X86::VFNMADD213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zrk, X86::VFNMADD213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD213NEPBF16Zrkz, X86::VFNMADD213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD213PDZ128rkz, X86::VFNMADD213PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD213PDZ256rk, X86::VFNMADD213PDZ256mbk, TB_BCAST_SD}, @@ -9535,6 +9990,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmbk, TB_BCAST_SS}, {X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmbkz, TB_BCAST_SS}, + {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Z256rkz, X86::VFNMADD231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zrk, X86::VFNMADD231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMADD231NEPBF16Zrkz, X86::VFNMADD231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mbk, TB_BCAST_SD}, {X86::VFNMADD231PDZ128rkz, X86::VFNMADD231PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMADD231PDZ256rk, X86::VFNMADD231PDZ256mbk, TB_BCAST_SD}, @@ -9553,6 +10014,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmbk, TB_BCAST_SS}, {X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Z256rkz, X86::VFNMSUB132NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zrk, X86::VFNMSUB132NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB132NEPBF16Zrkz, X86::VFNMSUB132NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB132PDZ128rkz, X86::VFNMSUB132PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB132PDZ256rk, X86::VFNMSUB132PDZ256mbk, TB_BCAST_SD}, @@ -9571,6 +10038,12 @@ static const 
X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmbk, TB_BCAST_SS}, {X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Z256rkz, X86::VFNMSUB213NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zrk, X86::VFNMSUB213NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB213NEPBF16Zrkz, X86::VFNMSUB213NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB213PDZ128rkz, X86::VFNMSUB213PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB213PDZ256rk, X86::VFNMSUB213PDZ256mbk, TB_BCAST_SD}, @@ -9589,6 +10062,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mbkz, TB_BCAST_SS}, {X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmbk, TB_BCAST_SS}, {X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmbkz, TB_BCAST_SS}, + {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mbkz, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Z256rkz, X86::VFNMSUB231NEPBF16Z256mbkz, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zrk, X86::VFNMSUB231NEPBF16Zmbk, TB_BCAST_SH}, + {X86::VFNMSUB231NEPBF16Zrkz, X86::VFNMSUB231NEPBF16Zmbkz, TB_BCAST_SH}, {X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mbk, TB_BCAST_SD}, {X86::VFNMSUB231PDZ128rkz, X86::VFNMSUB231PDZ128mbkz, TB_BCAST_SD}, {X86::VFNMSUB231PDZ256rk, X86::VFNMSUB231PDZ256mbk, TB_BCAST_SD}, @@ -9622,6 +10101,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmbk, TB_BCAST_SS}, {X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmbk, TB_BCAST_SS}, {X86::VMAXCPSZrrk, X86::VMAXCPSZrmbk, TB_BCAST_SS}, + {X86::VMAXPBF16Z128rrk, X86::VMAXPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMAXPBF16Z256rrk, X86::VMAXPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMAXPBF16Zrrk, X86::VMAXPBF16Zrmbk, TB_BCAST_SH}, {X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmbk, TB_BCAST_SD}, {X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmbk, TB_BCAST_SD}, {X86::VMAXPDZrrk, X86::VMAXPDZrmbk, TB_BCAST_SD}, @@ -9652,6 +10134,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmbik, TB_BCAST_SS}, {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmbik, TB_BCAST_SS}, {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmbik, TB_BCAST_SS}, + {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmbk, TB_BCAST_SH}, {X86::VMINPDZ128rrk, X86::VMINPDZ128rmbk, TB_BCAST_SD}, {X86::VMINPDZ256rrk, X86::VMINPDZ256rmbk, TB_BCAST_SD}, {X86::VMINPDZrrk, X86::VMINPDZrmbk, TB_BCAST_SD}, @@ -9661,6 +10146,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VMINPSZ128rrk, X86::VMINPSZ128rmbk, TB_BCAST_SS}, {X86::VMINPSZ256rrk, X86::VMINPSZ256rmbk, TB_BCAST_SS}, {X86::VMINPSZrrk, X86::VMINPSZrmbk, TB_BCAST_SS}, + {X86::VMULNEPBF16Z128rrk, X86::VMULNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VMULNEPBF16Z256rrk, X86::VMULNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VMULNEPBF16Zrrk, X86::VMULNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VMULPDZ128rrk, X86::VMULPDZ128rmbk, TB_BCAST_SD}, {X86::VMULPDZ256rrk, 
X86::VMULPDZ256rmbk, TB_BCAST_SD}, {X86::VMULPDZrrk, X86::VMULPDZrmbk, TB_BCAST_SD}, @@ -10023,6 +10511,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VRANGEPSZ128rrik, X86::VRANGEPSZ128rmbik, TB_BCAST_SS}, {X86::VRANGEPSZ256rrik, X86::VRANGEPSZ256rmbik, TB_BCAST_SS}, {X86::VRANGEPSZrrik, X86::VRANGEPSZrmbik, TB_BCAST_SS}, + {X86::VSCALEFPBF16Z128rrk, X86::VSCALEFPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Z256rrk, X86::VSCALEFPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VSCALEFPBF16Zrrk, X86::VSCALEFPBF16Zrmbk, TB_BCAST_SH}, {X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmbk, TB_BCAST_SD}, {X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmbk, TB_BCAST_SD}, {X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmbk, TB_BCAST_SD}, @@ -10046,6 +10537,9 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmbik, TB_BCAST_SS}, {X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmbik, TB_BCAST_SS}, {X86::VSHUFPSZrrik, X86::VSHUFPSZrmbik, TB_BCAST_SS}, + {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmbk, TB_BCAST_SH}, + {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmbk, TB_BCAST_SH}, + {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmbk, TB_BCAST_SH}, {X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmbk, TB_BCAST_SD}, {X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmbk, TB_BCAST_SD}, {X86::VSUBPDZrrk, X86::VSUBPDZrmbk, TB_BCAST_SD}, From 814aa432abf8e9f644903061029e6e27f6a418a8 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 3 Sep 2024 17:15:24 -0700 Subject: [PATCH 005/425] [SandboxIR] Implement ConstantAggregate (#107136) This patch implements sandboxir:: ConstantAggregate, ConstantStruct, ConstantArray and ConstantVector, mirroring LLVM IR. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 94 +++++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 3 + llvm/include/llvm/SandboxIR/Type.h | 45 +++++++-- llvm/lib/SandboxIR/SandboxIR.cpp | 48 +++++++++- llvm/lib/SandboxIR/Type.cpp | 15 +++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 75 +++++++++++++++ llvm/unittests/SandboxIR/TypesTest.cpp | 38 ++++++++ 7 files changed, 310 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 0ac049af4db2b..5c2d58c1b99dc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -304,6 +304,8 @@ class Value { friend class PHINode; // For getting `Val`. friend class UnreachableInst; // For getting `Val`. friend class CatchSwitchAddHandler; // For `Val`. + friend class ConstantArray; // For `Val`. + friend class ConstantStruct; // For `Val`. /// All values point to the context. Context &Ctx; @@ -840,6 +842,97 @@ class ConstantFP final : public Constant { #endif }; +/// Base class for aggregate constants (with operands). +class ConstantAggregate : public Constant { +protected: + ConstantAggregate(ClassID ID, llvm::Constant *C, Context &Ctx) + : Constant(ID, C, Ctx) {} + +public: + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + auto ID = From->getSubclassID(); + return ID == ClassID::ConstantVector || ID == ClassID::ConstantStruct || + ID == ClassID::ConstantArray; + } +}; + +class ConstantArray final : public ConstantAggregate { + ConstantArray(llvm::ConstantArray *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantArray, C, Ctx) {} + friend class Context; // For constructor. + +public: + static Constant *get(ArrayType *T, ArrayRef V); + ArrayType *getType() const; + + // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get(). + + /// For isa/dyn_cast. 
+ static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantArray; + } +}; + +class ConstantStruct final : public ConstantAggregate { + ConstantStruct(llvm::ConstantStruct *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantStruct, C, Ctx) {} + friend class Context; // For constructor. + +public: + static Constant *get(StructType *T, ArrayRef V); + + template + static std::enable_if_t::value, Constant *> + get(StructType *T, Csts *...Vs) { + return get(T, ArrayRef({Vs...})); + } + /// Return an anonymous struct that has the specified elements. + /// If the struct is possibly empty, then you must specify a context. + static Constant *getAnon(ArrayRef V, bool Packed = false) { + return get(getTypeForElements(V, Packed), V); + } + static Constant *getAnon(Context &Ctx, ArrayRef V, + bool Packed = false) { + return get(getTypeForElements(Ctx, V, Packed), V); + } + /// This version of the method allows an empty list. + static StructType *getTypeForElements(Context &Ctx, ArrayRef V, + bool Packed = false); + /// Return an anonymous struct type to use for a constant with the specified + /// set of elements. The list must not be empty. + static StructType *getTypeForElements(ArrayRef V, + bool Packed = false) { + assert(!V.empty() && + "ConstantStruct::getTypeForElements cannot be called on empty list"); + return getTypeForElements(V[0]->getContext(), V, Packed); + } + + /// Specialization - reduce amount of casting. + inline StructType *getType() const { + return cast(Value::getType()); + } + + /// For isa/dyn_cast. + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantStruct; + } +}; + +class ConstantVector final : public ConstantAggregate { + ConstantVector(llvm::ConstantVector *C, Context &Ctx) + : ConstantAggregate(ClassID::ConstantVector, C, Ctx) {} + friend class Context; // For constructor. + +public: + // TODO: Missing functions: getSplat(), getType(), getSplatValue(), get(). + + /// For isa/dyn_cast. + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ConstantVector; + } +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { @@ -3353,6 +3446,7 @@ class Context { friend class Type; // For LLVMCtx. friend class PointerType; // For LLVMCtx. friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. Tracker IRTracker; /// Maps LLVM Value to the corresponding sandboxir::Value. 
Owns all diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 2fc24ed71c4cf..d2031bbdcfb54 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -27,6 +27,9 @@ DEF_VALUE(Block, BasicBlock) DEF_CONST(Constant, Constant) DEF_CONST(ConstantInt, ConstantInt) DEF_CONST(ConstantFP, ConstantFP) +DEF_CONST(ConstantArray, ConstantArray) +DEF_CONST(ConstantStruct, ConstantStruct) +DEF_CONST(ConstantVector, ConstantVector) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 39c545a6e6c6d..2f9b94b8d7175 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -27,6 +27,8 @@ class PointerType; class VectorType; class IntegerType; class FunctionType; +class ArrayType; +class StructType; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; #include "llvm/SandboxIR/SandboxIRValues.def" @@ -36,13 +38,19 @@ class FunctionType; class Type { protected: llvm::Type *LLVMTy; - friend class VectorType; // For LLVMTy. - friend class PointerType; // For LLVMTy. - friend class FunctionType; // For LLVMTy. - friend class IntegerType; // For LLVMTy. - friend class Function; // For LLVMTy. - friend class CallBase; // For LLVMTy. - friend class ConstantInt; // For LLVMTy. + friend class ArrayType; // For LLVMTy. + friend class StructType; // For LLVMTy. + friend class VectorType; // For LLVMTy. + friend class PointerType; // For LLVMTy. + friend class FunctionType; // For LLVMTy. + friend class IntegerType; // For LLVMTy. + friend class Function; // For LLVMTy. + friend class CallBase; // For LLVMTy. + friend class ConstantInt; // For LLVMTy. + friend class ConstantArray; // For LLVMTy. + friend class ConstantStruct; // For LLVMTy. + friend class ConstantVector; // For LLVMTy. + // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; #define DEF_CONST(ID, CLASS) friend class CLASS; @@ -281,8 +289,31 @@ class PointerType : public Type { } }; +class ArrayType : public Type { +public: + // TODO: add missing functions + static bool classof(const Type *From) { + return isa(From->LLVMTy); + } +}; + +class StructType : public Type { +public: + /// This static method is the primary way to create a literal StructType. 
+ static StructType *get(Context &Ctx, ArrayRef Elements, + bool IsPacked = false); + + bool isPacked() const { return cast(LLVMTy)->isPacked(); } + + // TODO: add missing functions + static bool classof(const Type *From) { + return isa(From->LLVMTy); + } +}; + class VectorType : public Type { public: + static VectorType *get(Type *ElementType, ElementCount EC); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 5af6fbdde42cb..e8d081e6b17e7 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2364,6 +2364,44 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat &V) { return llvm::ConstantFP::isValueValidForType(Ty->LLVMTy, V); } +Constant *ConstantArray::get(ArrayType *T, ArrayRef V) { + auto &Ctx = T->getContext(); + SmallVector LLVMValues; + LLVMValues.reserve(V.size()); + for (auto *Elm : V) + LLVMValues.push_back(cast(Elm->Val)); + auto *LLVMC = + llvm::ConstantArray::get(cast(T->LLVMTy), LLVMValues); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + +ArrayType *ConstantArray::getType() const { + return cast( + Ctx.getType(cast(Val)->getType())); +} + +Constant *ConstantStruct::get(StructType *T, ArrayRef V) { + auto &Ctx = T->getContext(); + SmallVector LLVMValues; + LLVMValues.reserve(V.size()); + for (auto *Elm : V) + LLVMValues.push_back(cast(Elm->Val)); + auto *LLVMC = + llvm::ConstantStruct::get(cast(T->LLVMTy), LLVMValues); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + +StructType *ConstantStruct::getTypeForElements(Context &Ctx, + ArrayRef V, + bool Packed) { + unsigned VecSize = V.size(); + SmallVector EltTypes; + EltTypes.reserve(VecSize); + for (Constant *Elm : V) + EltTypes.push_back(Elm->getType()); + return StructType::get(Ctx, EltTypes, Packed); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2459,7 +2497,15 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new ConstantFP(CF, *this)); return It->second.get(); } - if (auto *F = dyn_cast(LLVMV)) + if (auto *CA = dyn_cast(C)) + It->second = std::unique_ptr(new ConstantArray(CA, *this)); + else if (auto *CS = dyn_cast(C)) + It->second = + std::unique_ptr(new ConstantStruct(CS, *this)); + else if (auto *CV = dyn_cast(C)) + It->second = + std::unique_ptr(new ConstantVector(CV, *this)); + else if (auto *F = dyn_cast(LLVMV)) It->second = std::unique_ptr(new Function(F, *this)); else It->second = std::unique_ptr(new Constant(C, *this)); diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index eee69c5ec7c89..535b0f75fd874 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -47,6 +47,21 @@ PointerType *PointerType::get(Context &Ctx, unsigned AddressSpace) { Ctx.getType(llvm::PointerType::get(Ctx.LLVMCtx, AddressSpace))); } +StructType *StructType::get(Context &Ctx, ArrayRef Elements, + bool IsPacked) { + SmallVector LLVMElements; + LLVMElements.reserve(Elements.size()); + for (Type *Elm : Elements) + LLVMElements.push_back(Elm->LLVMTy); + return cast( + Ctx.getType(llvm::StructType::get(Ctx.LLVMCtx, LLVMElements, IsPacked))); +} + +VectorType *VectorType::get(Type *ElementType, ElementCount EC) { + return cast(ElementType->getContext().getType( + llvm::VectorType::get(ElementType->LLVMTy, EC))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast( 
Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 2ec8eefd8c323..ca2a183e53268 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -445,6 +445,81 @@ define void @foo(float %v0, double %v1) { EXPECT_TRUE(NegZero->isExactlyValue(-0.0)); } +// Tests ConstantArray, ConstantStruct and ConstantVector. +TEST_F(SandboxIRTest, ConstantAggregate) { + // Note: we are using i42 to avoid the creation of ConstantDataVector or + // ConstantDataArray. + parseIR(C, R"IR( +define void @foo() { + %array = extractvalue [2 x i42] [i42 0, i42 1], 0 + %struct = extractvalue {i42, i42} {i42 0, i42 1}, 0 + %vector = extractelement <2 x i42> , i32 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *I0 = &*It++; + auto *I1 = &*It++; + auto *I2 = &*It++; + // Check classof() and creation. + auto *Array = cast(I0->getOperand(0)); + EXPECT_TRUE(isa(Array)); + auto *Struct = cast(I1->getOperand(0)); + EXPECT_TRUE(isa(Struct)); + auto *Vector = cast(I2->getOperand(0)); + EXPECT_TRUE(isa(Vector)); + + auto *ZeroI42 = cast(Array->getOperand(0)); + auto *OneI42 = cast(Array->getOperand(1)); + // Check ConstantArray::get(), getType(). + auto *NewCA = + sandboxir::ConstantArray::get(Array->getType(), {ZeroI42, OneI42}); + EXPECT_EQ(NewCA, Array); + + // Check ConstantStruct::get(), getType(). + auto *NewCS = + sandboxir::ConstantStruct::get(Struct->getType(), {ZeroI42, OneI42}); + EXPECT_EQ(NewCS, Struct); + // Check ConstantStruct::get(...). + auto *NewCS2 = + sandboxir::ConstantStruct::get(Struct->getType(), ZeroI42, OneI42); + EXPECT_EQ(NewCS2, Struct); + // Check ConstantStruct::getAnon(ArayRef). + auto *AnonCS = sandboxir::ConstantStruct::getAnon({ZeroI42, OneI42}); + EXPECT_FALSE(cast(AnonCS->getType())->isPacked()); + auto *AnonCSPacked = + sandboxir::ConstantStruct::getAnon({ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_TRUE(cast(AnonCSPacked->getType())->isPacked()); + // Check ConstantStruct::getAnon(Ctx, ArrayRef). + auto *AnonCS2 = sandboxir::ConstantStruct::getAnon(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(AnonCS2, AnonCS); + auto *AnonCS2Packed = sandboxir::ConstantStruct::getAnon( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_EQ(AnonCS2Packed, AnonCSPacked); + // Check ConstantStruct::getTypeForElements(Ctx, ArrayRef). + auto *StructTy = + sandboxir::ConstantStruct::getTypeForElements(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(StructTy, Struct->getType()); + EXPECT_FALSE(StructTy->isPacked()); + // Check ConstantStruct::getTypeForElements(Ctx, ArrayRef, Packed). + auto *StructTyPacked = sandboxir::ConstantStruct::getTypeForElements( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_TRUE(StructTyPacked->isPacked()); + // Check ConstantStruct::getTypeForElements(ArrayRef). + auto *StructTy2 = + sandboxir::ConstantStruct::getTypeForElements(Ctx, {ZeroI42, OneI42}); + EXPECT_EQ(StructTy2, Struct->getType()); + // Check ConstantStruct::getTypeForElements(ArrayRef, Packed). 
+ auto *StructTy2Packed = sandboxir::ConstantStruct::getTypeForElements( + Ctx, {ZeroI42, OneI42}, /*Packed=*/true); + EXPECT_EQ(StructTy2Packed, StructTyPacked); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index dcbf65a20b2fd..d4c2de441268c 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -224,6 +224,44 @@ define void @foo(ptr %ptr) { EXPECT_EQ(NewPtrTy2, PtrTy); } +TEST_F(SandboxTypeTest, ArrayType) { + parseIR(C, R"IR( +define void @foo([2 x i8] %v0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation. + [[maybe_unused]] auto *ArrayTy = + cast(F->getArg(0)->getType()); +} + +TEST_F(SandboxTypeTest, StructType) { + parseIR(C, R"IR( +define void @foo({i32, i8} %v0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *Int32Ty = sandboxir::Type::getInt32Ty(Ctx); + auto *Int8Ty = sandboxir::Type::getInt8Ty(Ctx); + // Check classof(), creation. + [[maybe_unused]] auto *StructTy = + cast(F->getArg(0)->getType()); + // Check get(). + auto *NewStructTy = sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}); + EXPECT_EQ(NewStructTy, StructTy); + // Check get(Packed). + auto *NewStructTyPacked = + sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}, /*Packed=*/true); + EXPECT_NE(NewStructTyPacked, StructTy); + EXPECT_TRUE(NewStructTyPacked->isPacked()); +} + TEST_F(SandboxTypeTest, VectorType) { parseIR(C, R"IR( define void @foo(<2 x i8> %v0) { From 48bc8b0f7f49f5b23884a0d9d21056ec0bfffe24 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 4 Sep 2024 00:26:54 +0000 Subject: [PATCH 006/425] [gn build] Port 83ad644afaac --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 73cee07a8b9d7..f3553210191a4 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -134,10 +134,12 @@ copy("Headers") { "arm_cmse.h", "arm_neon_sve_bridge.h", "armintr.h", + "avx10_2_512bf16intrin.h", "avx10_2_512convertintrin.h", "avx10_2_512minmaxintrin.h", "avx10_2_512niintrin.h", "avx10_2_512satcvtintrin.h", + "avx10_2bf16intrin.h", "avx10_2convertintrin.h", "avx10_2minmaxintrin.h", "avx10_2niintrin.h", From ff0f2011e475141454028bce9cf7c6ff37a49620 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 17:50:04 -0700 Subject: [PATCH 007/425] [RISCV] Bitcast fixed length bf16/f16 build_vector to i16 with Zvfbfmin/Zvfhmin+Zfbfmin/Zfhmin. (#106637) Previously, if Zfbfmin/Zfhmin were enabled, we only handled build_vectors that could be turned into splat_vectors. We promoted them to f32 splats by extending in the scalar domain and narrowing in the vector domain. This patch fixes a crash where we failed to account for whether the f32 vector type fit in LMUL<=8. Because the new lowering occurs after type legalization, we have to be careful to use XLenVT for the scalar integer type and use custom cast nodes. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 60 +- .../rvv/fixed-vectors-fp-buildvec-bf16.ll | 175 ++ .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 151 +- .../RISCV/rvv/fixed-vectors-fp-setcc.ll | 160 +- .../RISCV/rvv/fixed-vectors-fp-splat-bf16.ll | 45 +- .../RISCV/rvv/fixed-vectors-fp-splat.ll | 89 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 1722 +++++++++-------- .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 196 +- .../RISCV/rvv/fixed-vectors-vfadd-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfdiv-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfma-vp.ll | 104 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll | 80 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfmul-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vfsub-vp.ll | 80 +- .../RISCV/rvv/fixed-vectors-vpmerge-bf16.ll | 40 +- .../RISCV/rvv/fixed-vectors-vpmerge.ll | 40 +- 17 files changed, 1667 insertions(+), 1595 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 303e0a1eee9cf..11be5c25354f1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1322,6 +1322,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (VT.getVectorElementType() == MVT::f16 && !Subtarget.hasVInstructionsF16()) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, @@ -1331,8 +1332,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); if (Subtarget.hasStdExtZfhmin()) { - // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); } else { // We need to custom legalize f16 build vectors if Zfhmin isn't // available. @@ -1350,10 +1350,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (VT.getVectorElementType() == MVT::bf16) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); if (Subtarget.hasStdExtZfbfmin()) { - // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); } else { // We need to custom legalize bf16 build vectors if Zfbfmin isn't // available. @@ -4120,26 +4120,45 @@ static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG, DAG.getBuildVector(WideVecVT, DL, NewOperands)); } -// Convert to an vXf16 build_vector to vXi16 with bitcasts. -static SDValue lowerBUILD_VECTORvXf16(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT IVT = VT.changeVectorElementType(MVT::i16); - SmallVector NewOps(Op.getNumOperands()); - for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) - NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I)); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), IVT, NewOps); - return DAG.getBitcast(VT, Res); -} - static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert(VT.isFixedLengthVector() && "Unexpected vector!"); - // If we don't have scalar f16/bf16, we need to bitcast to an i16 vector. 
- if ((VT.getVectorElementType() == MVT::f16 && !Subtarget.hasStdExtZfhmin()) || - (VT.getVectorElementType() == MVT::bf16 && !Subtarget.hasStdExtZfbfmin())) - return lowerBUILD_VECTORvXf16(Op, DAG); + MVT EltVT = VT.getVectorElementType(); + MVT XLenVT = Subtarget.getXLenVT(); + + SDLoc DL(Op); + + // Proper support for f16 requires Zvfh. bf16 always requires special + // handling. We need to cast the scalar to integer and create an integer + // build_vector. + if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) || EltVT == MVT::bf16) { + MVT IVT = VT.changeVectorElementType(MVT::i16); + SmallVector NewOps(Op.getNumOperands()); + for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { + SDValue Elem = Op.getOperand(I); + if ((EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) || + (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin())) { + // Called by LegalizeDAG, we need to use XLenVT operations since we + // can't create illegal types. + if (auto *C = dyn_cast(Elem)) { + // Manually constant fold so the integer build_vector can be lowered + // better. Waiting for DAGCombine will be too late. + APInt V = + C->getValueAPF().bitcastToAPInt().sext(XLenVT.getSizeInBits()); + NewOps[I] = DAG.getConstant(V, DL, XLenVT); + } else { + NewOps[I] = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Elem); + } + } else { + // Called by scalar type legalizer, we can use i16. + NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I)); + } + } + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, IVT, NewOps); + return DAG.getBitcast(VT, Res); + } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) @@ -4147,11 +4166,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); - SDLoc DL(Op); auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - MVT XLenVT = Subtarget.getXLenVT(); - if (VT.getVectorElementType() == MVT::i1) { // A BUILD_VECTOR can be lowered as a SETCC. 
For each fixed-length mask // vector type, we have a legal equivalently-sized i8 type, so we can use diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll new file mode 100644 index 0000000000000..170e71af09b49 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-NO-ZFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-NO-ZFBFMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-ZFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-ZFBFMIN + +define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { +; RV32-NO-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV32-NO-ZFBFMIN: # %bb.0: +; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0 +; RV32-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9 +; RV32-NO-ZFBFMIN-NEXT: ret +; +; RV64-NO-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV64-NO-ZFBFMIN: # %bb.0: +; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0 +; RV64-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9 +; RV64-NO-ZFBFMIN-NEXT: ret +; +; RV32-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV32-ZFBFMIN: # %bb.0: +; RV32-ZFBFMIN-NEXT: addi sp, sp, -48 +; RV32-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48 +; RV32-ZFBFMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-ZFBFMIN-NEXT: .cfi_offset ra, -4 +; RV32-ZFBFMIN-NEXT: csrr a1, vlenb +; RV32-ZFBFMIN-NEXT: slli a1, a1, 1 +; RV32-ZFBFMIN-NEXT: sub sp, sp, a1 +; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFBFMIN-NEXT: addi a1, sp, 32 +; RV32-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-ZFBFMIN-NEXT: andi a0, a0, 3 +; RV32-ZFBFMIN-NEXT: li a1, 2 +; RV32-ZFBFMIN-NEXT: call __mulsi3 +; RV32-ZFBFMIN-NEXT: addi a1, sp, 16 +; RV32-ZFBFMIN-NEXT: add a0, a1, a0 +; RV32-ZFBFMIN-NEXT: addi a2, sp, 32 +; RV32-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV32-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZFBFMIN-NEXT: vse16.v v8, (a1) +; RV32-ZFBFMIN-NEXT: flh fa5, 0(a0) +; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa5 +; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a0 +; RV32-ZFBFMIN-NEXT: csrr a0, vlenb +; RV32-ZFBFMIN-NEXT: slli a0, a0, 1 +; RV32-ZFBFMIN-NEXT: add sp, sp, a0 +; RV32-ZFBFMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-ZFBFMIN-NEXT: addi sp, sp, 48 +; RV32-ZFBFMIN-NEXT: ret +; +; RV64-ZFBFMIN-LABEL: splat_idx_v4bf16: +; RV64-ZFBFMIN: # %bb.0: +; RV64-ZFBFMIN-NEXT: addi sp, sp, -48 +; RV64-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48 +; RV64-ZFBFMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-ZFBFMIN-NEXT: .cfi_offset ra, -8 +; RV64-ZFBFMIN-NEXT: csrr a1, vlenb +; RV64-ZFBFMIN-NEXT: slli a1, a1, 1 +; RV64-ZFBFMIN-NEXT: sub sp, sp, a1 +; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 
0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV64-ZFBFMIN-NEXT: addi a1, sp, 32 +; RV64-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV64-ZFBFMIN-NEXT: andi a0, a0, 3 +; RV64-ZFBFMIN-NEXT: li a1, 2 +; RV64-ZFBFMIN-NEXT: call __muldi3 +; RV64-ZFBFMIN-NEXT: addi a1, sp, 16 +; RV64-ZFBFMIN-NEXT: add a0, a1, a0 +; RV64-ZFBFMIN-NEXT: addi a2, sp, 32 +; RV64-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV64-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZFBFMIN-NEXT: vse16.v v8, (a1) +; RV64-ZFBFMIN-NEXT: flh fa5, 0(a0) +; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa5 +; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a0 +; RV64-ZFBFMIN-NEXT: csrr a0, vlenb +; RV64-ZFBFMIN-NEXT: slli a0, a0, 1 +; RV64-ZFBFMIN-NEXT: add sp, sp, a0 +; RV64-ZFBFMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-ZFBFMIN-NEXT: addi sp, sp, 48 +; RV64-ZFBFMIN-NEXT: ret + %x = extractelement <4 x bfloat> %v, i64 %idx + %ins = insertelement <4 x bfloat> poison, bfloat %x, i32 0 + %splat = shufflevector <4 x bfloat> %ins, <4 x bfloat> poison, <4 x i32> zeroinitializer + ret <4 x bfloat> %splat +} + +define <2 x bfloat> @buildvec_v2bf16(bfloat %a, bfloat %b) { +; RV32-NO-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV32-NO-ZFBFMIN: # %bb.0: +; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1 +; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0 +; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV32-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NO-ZFBFMIN-NEXT: ret +; +; RV64-NO-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV64-NO-ZFBFMIN: # %bb.0: +; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1 +; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0 +; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV64-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NO-ZFBFMIN-NEXT: ret +; +; RV32-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV32-ZFBFMIN: # %bb.0: +; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa1 +; RV32-ZFBFMIN-NEXT: fmv.x.h a1, fa0 +; RV32-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV32-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ZFBFMIN-NEXT: ret +; +; RV64-ZFBFMIN-LABEL: buildvec_v2bf16: +; RV64-ZFBFMIN: # %bb.0: +; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa1 +; RV64-ZFBFMIN-NEXT: fmv.x.h a1, fa0 +; RV64-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-ZFBFMIN-NEXT: ret + %v1 = insertelement <2 x bfloat> poison, bfloat %a, i64 0 + %v2 = insertelement <2 x bfloat> %v1, bfloat %b, i64 1 + ret <2 x bfloat> %v2 +} + +define <2 x bfloat> @vid_v2bf16() { +; CHECK-LABEL: vid_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 260096 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_addend1_v2bf16() { +; CHECK-LABEL: vid_addend1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 262148 +; CHECK-NEXT: addi a0, a0, -128 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_denominator2_v2bf16() { +; CHECK-LABEL: vid_denominator2_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 260100 +; CHECK-NEXT: addi a0, a0, -256 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: ret + ret <2 x bfloat> +} + +define <2 x bfloat> @vid_step2_v2bf16() { +; CHECK-LABEL: vid_step2_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, 
e16, mf4, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsll.vi v8, v8, 14 +; CHECK-NEXT: ret + ret <2 x bfloat> +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV32ZVFBFMIN: {{.*}} +; RV64: {{.*}} +; RV64ZVFBFMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 26ed4595ca758..e3aabb5de29c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -4,8 +4,10 @@ ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RV64V ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RVA22U64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-NO-ZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-NO-ZFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-ZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-ZFHMIN ; Tests that a floating-point build_vector doesn't try and generate a VID ; instruction @@ -169,12 +171,95 @@ define <4 x half> @splat_c3_v4f16(<4 x half> %v) { } define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { -; CHECK-LABEL: splat_idx_v4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vrgather.vx v9, v8, a0 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: splat_idx_v4f16: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32ZVFH-NEXT: vrgather.vx v9, v8, a0 +; RV32ZVFH-NEXT: vmv1r.v v8, v9 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: splat_idx_v4f16: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVFH-NEXT: vrgather.vx v9, v8, a0 +; RV64ZVFH-NEXT: vmv1r.v v8, v9 +; RV64ZVFH-NEXT: ret +; +; RV32-NO-ZFHMIN-LABEL: splat_idx_v4f16: +; RV32-NO-ZFHMIN: # %bb.0: +; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0 +; RV32-NO-ZFHMIN-NEXT: vmv1r.v v8, v9 +; RV32-NO-ZFHMIN-NEXT: ret +; +; RV64-NO-ZFHMIN-LABEL: splat_idx_v4f16: +; RV64-NO-ZFHMIN: # %bb.0: +; RV64-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0 +; RV64-NO-ZFHMIN-NEXT: vmv1r.v v8, v9 +; RV64-NO-ZFHMIN-NEXT: ret +; +; RV32-ZFHMIN-LABEL: splat_idx_v4f16: +; RV32-ZFHMIN: # %bb.0: +; RV32-ZFHMIN-NEXT: addi sp, sp, -48 +; RV32-ZFHMIN-NEXT: 
.cfi_def_cfa_offset 48 +; RV32-ZFHMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-ZFHMIN-NEXT: .cfi_offset ra, -4 +; RV32-ZFHMIN-NEXT: csrr a1, vlenb +; RV32-ZFHMIN-NEXT: slli a1, a1, 1 +; RV32-ZFHMIN-NEXT: sub sp, sp, a1 +; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFHMIN-NEXT: addi a1, sp, 32 +; RV32-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-ZFHMIN-NEXT: andi a0, a0, 3 +; RV32-ZFHMIN-NEXT: li a1, 2 +; RV32-ZFHMIN-NEXT: call __mulsi3 +; RV32-ZFHMIN-NEXT: addi a1, sp, 16 +; RV32-ZFHMIN-NEXT: add a0, a1, a0 +; RV32-ZFHMIN-NEXT: addi a2, sp, 32 +; RV32-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV32-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-ZFHMIN-NEXT: vse16.v v8, (a1) +; RV32-ZFHMIN-NEXT: flh fa5, 0(a0) +; RV32-ZFHMIN-NEXT: fmv.x.h a0, fa5 +; RV32-ZFHMIN-NEXT: vmv.v.x v8, a0 +; RV32-ZFHMIN-NEXT: csrr a0, vlenb +; RV32-ZFHMIN-NEXT: slli a0, a0, 1 +; RV32-ZFHMIN-NEXT: add sp, sp, a0 +; RV32-ZFHMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-ZFHMIN-NEXT: addi sp, sp, 48 +; RV32-ZFHMIN-NEXT: ret +; +; RV64-ZFHMIN-LABEL: splat_idx_v4f16: +; RV64-ZFHMIN: # %bb.0: +; RV64-ZFHMIN-NEXT: addi sp, sp, -48 +; RV64-ZFHMIN-NEXT: .cfi_def_cfa_offset 48 +; RV64-ZFHMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-ZFHMIN-NEXT: .cfi_offset ra, -8 +; RV64-ZFHMIN-NEXT: csrr a1, vlenb +; RV64-ZFHMIN-NEXT: slli a1, a1, 1 +; RV64-ZFHMIN-NEXT: sub sp, sp, a1 +; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV64-ZFHMIN-NEXT: addi a1, sp, 32 +; RV64-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV64-ZFHMIN-NEXT: andi a0, a0, 3 +; RV64-ZFHMIN-NEXT: li a1, 2 +; RV64-ZFHMIN-NEXT: call __muldi3 +; RV64-ZFHMIN-NEXT: addi a1, sp, 16 +; RV64-ZFHMIN-NEXT: add a0, a1, a0 +; RV64-ZFHMIN-NEXT: addi a2, sp, 32 +; RV64-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; RV64-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-ZFHMIN-NEXT: vse16.v v8, (a1) +; RV64-ZFHMIN-NEXT: flh fa5, 0(a0) +; RV64-ZFHMIN-NEXT: fmv.x.h a0, fa5 +; RV64-ZFHMIN-NEXT: vmv.v.x v8, a0 +; RV64-ZFHMIN-NEXT: csrr a0, vlenb +; RV64-ZFHMIN-NEXT: slli a0, a0, 1 +; RV64-ZFHMIN-NEXT: add sp, sp, a0 +; RV64-ZFHMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-ZFHMIN-NEXT: addi sp, sp, 48 +; RV64-ZFHMIN-NEXT: ret %x = extractelement <4 x half> %v, i64 %idx %ins = insertelement <4 x half> poison, half %x, i32 0 %splat = shufflevector <4 x half> %ins, <4 x half> poison, <4 x i32> zeroinitializer @@ -295,23 +380,41 @@ define <2 x half> @buildvec_v2f16(half %a, half %b) { ; RV64ZVFH-NEXT: vfslide1down.vf v8, v8, fa1 ; RV64ZVFH-NEXT: ret ; -; RV32ZVFHMIN-LABEL: buildvec_v2f16: -; RV32ZVFHMIN: # %bb.0: -; RV32ZVFHMIN-NEXT: fmv.x.w a0, fa1 -; RV32ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV32ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 -; RV32ZVFHMIN-NEXT: ret +; RV32-NO-ZFHMIN-LABEL: buildvec_v2f16: +; RV32-NO-ZFHMIN: # %bb.0: +; RV32-NO-ZFHMIN-NEXT: fmv.x.w a0, fa1 +; RV32-NO-ZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NO-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV32-NO-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NO-ZFHMIN-NEXT: ret ; -; RV64ZVFHMIN-LABEL: buildvec_v2f16: -; RV64ZVFHMIN: # %bb.0: -; RV64ZVFHMIN-NEXT: fmv.x.w a0, fa1 -; 
RV64ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV64ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVFHMIN-NEXT: ret +; RV64-NO-ZFHMIN-LABEL: buildvec_v2f16: +; RV64-NO-ZFHMIN: # %bb.0: +; RV64-NO-ZFHMIN-NEXT: fmv.x.w a0, fa1 +; RV64-NO-ZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-NO-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NO-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-NO-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NO-ZFHMIN-NEXT: ret +; +; RV32-ZFHMIN-LABEL: buildvec_v2f16: +; RV32-ZFHMIN: # %bb.0: +; RV32-ZFHMIN-NEXT: fmv.x.h a0, fa1 +; RV32-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV32-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV32-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ZFHMIN-NEXT: ret +; +; RV64-ZFHMIN-LABEL: buildvec_v2f16: +; RV64-ZFHMIN: # %bb.0: +; RV64-ZFHMIN-NEXT: fmv.x.h a0, fa1 +; RV64-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0 +; RV64-ZFHMIN-NEXT: ret %v1 = insertelement <2 x half> poison, half %a, i64 0 %v2 = insertelement <2 x half> %v1, half %b, i64 1 ret <2 x half> %v2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index a566fab1596f6..31e2d75e514b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -648,12 +648,8 @@ define void @fcmp_oeq_vf_v8f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -681,12 +677,8 @@ define void @fcmp_oeq_vf_v8f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -814,12 +806,8 @@ define void @fcmp_olt_vf_v16f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -847,12 +835,8 @@ define void @fcmp_olt_vf_v16f16_nonans(ptr %x, half %y, ptr %z) { ; 
ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -965,12 +949,8 @@ define void @fcmp_ule_vf_v32f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1001,12 +981,8 @@ define void @fcmp_ule_vf_v32f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1217,19 +1193,15 @@ define void @fcmp_ord_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1275,19 +1247,15 @@ define void @fcmp_uno_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1321,12 +1289,8 @@ define void @fcmp_oeq_fv_v8f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -1354,12 +1318,8 @@ define void @fcmp_oeq_fv_v8f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -1487,12 +1447,8 @@ define void @fcmp_olt_fv_v16f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -1520,12 +1476,8 @@ define void @fcmp_olt_fv_v16f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -1638,12 +1590,8 @@ define void @fcmp_ule_fv_v32f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: 
fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1674,12 +1622,8 @@ define void @fcmp_ule_fv_v32f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: li a2, 32 ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v12, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -1890,19 +1834,15 @@ define void @fcmp_ord_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1948,19 +1888,15 @@ define void @fcmp_uno_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll index b1250f4804549..c94cdadc8ca59 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll @@ -7,11 +7,9 @@ define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, 
zero, e32, m2, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v10, fa5 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; ; ZVFBFMIN-LABEL: splat_v8bf16: @@ -28,11 +26,9 @@ define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v12, fa5 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; ; ZVFBFMIN-LABEL: splat_16bf16: @@ -46,10 +42,31 @@ define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { ret <16 x bfloat> %b } +define <64 x bfloat> @splat_64bf16(ptr %x, bfloat %y) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_64bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: fmv.x.h a0, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: li a1, 64 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_64bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: li a1, 64 +; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + %a = insertelement <64 x bfloat> poison, bfloat %y, i32 0 + %b = shufflevector <64 x bfloat> %a, <64 x bfloat> poison, <64 x i32> zeroinitializer + ret <64 x bfloat> %b +} + define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -64,7 +81,7 @@ define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { define <16 x bfloat> @splat_zero_16bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -80,7 +97,7 @@ define <8 x bfloat> @splat_negzero_v8bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_v8bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: ; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; @@ -97,7 +114,7 @@ define <16 x bfloat> @splat_negzero_16bf16(ptr %x) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_16bf16: ; ZFBFMIN-ZVFBFMIN: # %bb.0: ; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 -; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZFBFMIN-ZVFBFMIN-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll index 10103813d526c..250b3e90cbbb6 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64-ZVFHMIN-NOZFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64_ZVFHMIN-ZFHMIN define void @splat_v8f16(ptr %x, half %y) { ; CHECK-RV32-LABEL: splat_v8f16: @@ -18,13 +19,21 @@ define void @splat_v8f16(ptr %x, half %y) { ; RV64-ZVFH-NEXT: vse16.v v8, (a0) ; RV64-ZVFH-NEXT: ret ; -; RV64-ZVFHMIN-LABEL: splat_v8f16: -; RV64-ZVFHMIN: # %bb.0: -; RV64-ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64-ZVFHMIN-NEXT: vse16.v v8, (a0) -; RV64-ZVFHMIN-NEXT: ret +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_v8f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_v8f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret %a = insertelement <8 x half> poison, half %y, i32 0 %b = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer store <8 x half> %b, ptr %x @@ -72,13 +81,21 @@ define void @splat_16f16(ptr %x, half %y) { ; RV64-ZVFH-NEXT: vse16.v v8, (a0) ; RV64-ZVFH-NEXT: ret ; -; RV64-ZVFHMIN-LABEL: splat_16f16: -; RV64-ZVFHMIN: # %bb.0: -; RV64-ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; RV64-ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-ZVFHMIN-NEXT: vmv.v.x v8, a1 -; RV64-ZVFHMIN-NEXT: vse16.v v8, (a0) -; RV64-ZVFHMIN-NEXT: ret +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_16f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_16f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret %a = insertelement <16 x half> poison, half %y, i32 0 %b = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> zeroinitializer store <16 x half> %b, ptr %x @@ -111,6 +128,46 @@ define void @splat_v4f64(ptr %x, double %y) { ret 
void } +define void @splat_64f16(ptr %x, half %y) { +; CHECK-RV32-LABEL: splat_64f16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: vfmv.v.f v8, fa0 +; CHECK-RV32-NEXT: vse16.v v8, (a0) +; CHECK-RV32-NEXT: ret +; +; RV64-ZVFH-LABEL: splat_64f16: +; RV64-ZVFH: # %bb.0: +; RV64-ZVFH-NEXT: li a1, 64 +; RV64-ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-ZVFH-NEXT: vfmv.v.f v8, fa0 +; RV64-ZVFH-NEXT: vse16.v v8, (a0) +; RV64-ZVFH-NEXT: ret +; +; RV64-ZVFHMIN-NOZFHMIN-LABEL: splat_64f16: +; RV64-ZVFHMIN-NOZFHMIN: # %bb.0: +; RV64-ZVFHMIN-NOZFHMIN-NEXT: fmv.x.w a1, fa0 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: li a2, 64 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64-ZVFHMIN-NOZFHMIN-NEXT: vse16.v v8, (a0) +; RV64-ZVFHMIN-NOZFHMIN-NEXT: ret +; +; RV64_ZVFHMIN-ZFHMIN-LABEL: splat_64f16: +; RV64_ZVFHMIN-ZFHMIN: # %bb.0: +; RV64_ZVFHMIN-ZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64_ZVFHMIN-ZFHMIN-NEXT: li a2, 64 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64_ZVFHMIN-ZFHMIN-NEXT: vmv.v.x v8, a1 +; RV64_ZVFHMIN-ZFHMIN-NEXT: vse16.v v8, (a0) +; RV64_ZVFHMIN-ZFHMIN-NEXT: ret + %a = insertelement <64 x half> poison, half %y, i32 0 + %b = shufflevector <64 x half> %a, <64 x half> poison, <64 x i32> zeroinitializer + store <64 x half> %b, ptr %x + ret void +} + define void @splat_zero_v8f16(ptr %x) { ; CHECK-LABEL: splat_zero_v8f16: ; CHECK: # %bb.0: @@ -268,3 +325,5 @@ define void @splat_negzero_v4f64(ptr %x) { store <4 x double> splat (double -0.0), ptr %x ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV64-ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index cdbca0b874e60..1843157573257 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -868,16 +868,12 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -903,16 +899,16 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: 
fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -927,16 +923,16 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -1225,10 +1221,6 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; ZVFHMIN-RV64-NEXT: vle64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a2, sp -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a2) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle32.v v9, (a1) ; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma @@ -2193,16 +2185,12 @@ define void @fadd_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2228,16 +2216,16 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; 
ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2252,16 +2240,16 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2331,16 +2319,12 @@ define void @fadd_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2366,16 +2350,16 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; 
ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2390,16 +2374,16 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2469,16 +2453,12 @@ define void @fsub_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2504,16 +2484,16 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2528,16 +2508,16 @@ define void 
@fsub_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2607,16 +2587,12 @@ define void @fsub_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfsub.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2642,16 +2618,16 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2666,16 +2642,16 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: 
vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2745,16 +2721,12 @@ define void @fmul_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2780,16 +2752,16 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2804,16 +2776,16 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2883,16 +2855,12 @@ define void @fmul_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -2918,16 +2886,16 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2942,16 +2910,16 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3021,16 
+2989,12 @@ define void @fdiv_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -3056,16 +3020,16 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3080,16 +3044,16 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v8 +; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3159,16 +3123,12 @@ define void @fdiv_fv_v8f16(ptr %x, half %y) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; 
ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) @@ -3194,16 +3154,16 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v10, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3218,16 +3178,16 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v8, v9 +; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v10, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -3297,22 +3257,18 @@ define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fma_vf_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3338,25 +3294,25 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fma_vf_v6f16: @@ -3364,22 +3320,22 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; 
ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3450,22 +3406,18 @@ define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fma_fv_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3491,25 +3443,25 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fma_fv_v6f16: @@ -3517,22 +3469,22 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli 
zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3603,26 +3555,22 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-LABEL: fmsub_vf_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle16.v v9, (a1) -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v10 +; ZVFHMIN-NEXT: vfneg.v v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x @@ -3650,30 +3598,30 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: li a2, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v9, v10 +; ZVFHMIN-RV32-NEXT: vfneg.v v9, v11 ; 
ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v9 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-RV32-NEXT: vfmacc.vv v8, v9, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: @@ -3681,27 +3629,27 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v10 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: li a2, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v9, v10 +; ZVFHMIN-RV64-NEXT: vfneg.v v9, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v9 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v11, v9, v8 +; ZVFHMIN-RV64-NEXT: vfmacc.vv v8, v9, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) ; ZVFHMIN-RV64-NEXT: ret @@ -3829,13 +3777,13 @@ define void @trunc_v8f16(ptr %x) { ; ; ZVFHMIN-ZFH-RV32-LABEL: trunc_v8f16: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: 
flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI115_0) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI115_0)(a1) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 @@ -3846,92 +3794,101 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: 
fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: trunc_v8f16: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI115_0) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI115_0)(a1) ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 @@ -3942,92 +3899,101 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz ; 
ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_10 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: 
fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v8f16: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 @@ -4039,107 +4005,116 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s 
fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 22(sp) -; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_12: +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_14: +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_16: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v8f16: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 @@ -4151,96 +4126,105 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_4 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_6 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_8 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 ; 
ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_10 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_10 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_12 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_12 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_12: +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_14 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_14 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_14: +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 0(sp) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_16 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_16 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_16: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.trunc.v8f16(<8 x half> %a) @@ -4268,13 +4252,13 @@ define void @trunc_v6f16(ptr %x) { ; ; ZVFHMIN-ZFH-RV32-LABEL: trunc_v6f16: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -48 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI116_0)(a1) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 @@ -4285,103 +4269,111 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 46(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 44(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 42(sp) -; 
ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 40(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 38(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a4, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a4, .LBB116_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a4, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa3, 36(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa2, 34(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: 
fabs.h fa0, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa5, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa1, 32(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa3, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa2, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa1, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 48 +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: trunc_v6f16: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI116_0)(a1) ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 @@ -4392,96 +4384,105 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: 
fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) +; 
ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa4, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v6f16: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -48 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 @@ -4493,118 +4494,126 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 46(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3: -; 
ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 44(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 42(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 40(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 38(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa2, fa5 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a4, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a4, .LBB116_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a4, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 36(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 34(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa1, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa1, fa5, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 32(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa3, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa2, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh 
fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 48 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v6f16: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 @@ -4616,100 +4625,109 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_4 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_6 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 ; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_10 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_10 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_12 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_12: +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) ; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_14 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_14: +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa4, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_16 +; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_16: ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.trunc.v6f16(<6 x half> %a) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 33e9cde4c30ab..8e2a225622eec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -64,12 +64,9 @@ define <8 x i1> @fcmp_oeq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_oeq_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: 
vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -91,12 +88,9 @@ define <8 x i1> @fcmp_oeq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -138,12 +132,9 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ogt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -165,12 +156,9 @@ define <8 x i1> @fcmp_ogt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -212,12 +200,9 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_oge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -239,12 +224,9 @@ define <8 x i1> @fcmp_oge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -286,12 +268,9 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_olt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; 
ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -313,12 +292,9 @@ define <8 x i1> @fcmp_olt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -360,12 +336,9 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ole_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -387,12 +360,9 @@ define <8 x i1> @fcmp_ole_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -439,12 +409,9 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_one_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -469,12 +436,9 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_one_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -526,12 +490,9 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ord_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; 
ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t @@ -560,12 +521,9 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t @@ -615,12 +573,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -645,12 +600,9 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -695,12 +647,9 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ugt_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -723,12 +672,9 @@ define <8 x i1> @fcmp_ugt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -772,12 +718,9 @@ define <8 x i1> 
@fcmp_uge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_uge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -800,12 +743,9 @@ define <8 x i1> @fcmp_uge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -849,12 +789,9 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ult_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -877,12 +814,9 @@ define <8 x i1> @fcmp_ult_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -926,12 +860,9 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_ule_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -954,12 +885,9 @@ define <8 x i1> @fcmp_ule_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, 
v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1001,12 +929,9 @@ define <8 x i1> @fcmp_une_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_une_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1028,12 +953,9 @@ define <8 x i1> @fcmp_une_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_une_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -1084,12 +1006,9 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ; ZVFHMIN-LABEL: fcmp_uno_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t @@ -1118,12 +1037,9 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll index f023c760f14a7..7a7236235d120 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfadd_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: 
vfadd.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfadd_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfadd_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfadd_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfadd_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfadd_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfadd_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfadd_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfadd_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: 
vfadd_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfadd_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfadd_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfadd_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfadd_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll index 9fb8377d5a5ef..cb83e5ff4f2b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfdiv_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfdiv_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: 
vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfdiv_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfdiv_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfdiv.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfdiv_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfdiv_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfdiv_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfdiv_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfdiv_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfdiv_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: 
vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfdiv_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfdiv_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index e2e48cee3eacc..6dcebc9763d82 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -64,17 +64,14 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i ; ; ZVFHMIN-LABEL: vfma_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -93,17 +90,14 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -169,17 +163,14 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i ; ; ZVFHMIN-LABEL: vfma_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -198,17 +189,14 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v10 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -274,17 +262,14 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i ; ; ZVFHMIN-LABEL: vfma_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -303,17 +288,14 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v ; ; ZVFHMIN-LABEL: vfma_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v10 +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -379,17 +361,14 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1 ; ; ZVFHMIN-LABEL: vfma_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v12 +; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -408,17 +387,14 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half ; ; ZVFHMIN-LABEL: vfma_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v12 +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll index a9d6b5f047ebb..11420a23285d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax.ll @@ -40,16 +40,13 @@ define <2 x half> @vfmax_v2f16_vf(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v2f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -68,16 +65,13 @@ define <2 x half> @vfmax_v2f16_fv(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v2f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -119,16 +113,13 @@ define <4 x half> @vfmax_v4f16_vf(<4 x half> %a, half %b) { ; ; 
ZVFHMIN-LABEL: vfmax_v4f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -147,16 +138,13 @@ define <4 x half> @vfmax_v4f16_fv(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v4f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -198,12 +186,9 @@ define <8 x half> @vfmax_v8f16_vf(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v8f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -226,12 +211,9 @@ define <8 x half> @vfmax_v8f16_fv(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v8f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -277,12 +259,9 @@ define <16 x half> @vfmax_v16f16_vf(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v16f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -305,12 +284,9 @@ define <16 x half> @vfmax_v16f16_fv(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmax_v16f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: 
fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll index d7c6fb3568f66..e8ae32a45f7cd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin.ll @@ -40,16 +40,13 @@ define <2 x half> @vfmin_v2f16_vf(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v2f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -68,16 +65,13 @@ define <2 x half> @vfmin_v2f16_fv(<2 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v2f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -119,16 +113,13 @@ define <4 x half> @vfmin_v4f16_vf(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v4f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -147,16 +138,13 @@ define <4 x half> @vfmin_v4f16_fv(<4 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v4f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, 
zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -198,12 +186,9 @@ define <8 x half> @vfmin_v8f16_vf(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v8f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -226,12 +211,9 @@ define <8 x half> @vfmin_v8f16_fv(<8 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v8f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -277,12 +259,9 @@ define <16 x half> @vfmin_v16f16_vf(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v16f16_vf: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -305,12 +284,9 @@ define <16 x half> @vfmin_v16f16_fv(<16 x half> %a, half %b) { ; ; ZVFHMIN-LABEL: vfmin_v16f16_fv: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll index 64ce0a12de8cf..86f140723d7f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfmul_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; 
ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfmul_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfmul_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfmul_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfmul.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfmul_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfmul_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfmul_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfmul_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -384,12 +366,9 @@ define <16 x half> @vfmul_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfmul_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfmul_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfmul_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll index eb717a851ed46..d0a0bf516d355 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-vp.ll @@ -61,16 +61,13 @@ define <2 x half> @vfsub_vf_v2f16(<2 x half> %va, half %b, <2 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -89,16 +86,13 @@ define <2 x half> @vfsub_vf_v2f16_unmasked(<2 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -184,16 +178,13 @@ define <4 x half> @vfsub_vf_v4f16(<4 x half> %va, half %b, <4 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8, v0.t +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -212,16 +203,13 @@ define <4 x half> @vfsub_vf_v4f16_unmasked(<4 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v9, v9, v8 +; ZVFHMIN-NEXT: vfsub.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -284,12 +272,9 @@ define <8 x half> @vfsub_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zero ; ; ZVFHMIN-LABEL: vfsub_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma @@ -312,12 +297,9 @@ define <8 x half> @vfsub_vf_v8f16_unmasked(<8 x half> %va, half %b, i32 zeroext ; ; ZVFHMIN-LABEL: vfsub_vf_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma 
@@ -384,12 +366,9 @@ define <16 x half> @vfsub_vf_v16f16(<16 x half> %va, half %b, <16 x i1> %m, i32 ; ; ZVFHMIN-LABEL: vfsub_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma @@ -412,12 +391,9 @@ define <16 x half> @vfsub_vf_v16f16_unmasked(<16 x half> %va, half %b, i32 zeroe ; ; ZVFHMIN-LABEL: vfsub_vf_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll index da7f9f56fcf16..4186a6b304a22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll @@ -24,11 +24,11 @@ define <2 x bfloat> @vpmerge_vv_v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x define <2 x bfloat> @vpmerge_vf_v2bf16(bfloat %a, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <2 x bfloat> %elt.head, <2 x bfloat> poison, <2 x i32> zeroinitializer @@ -52,11 +52,11 @@ define <4 x bfloat> @vpmerge_vv_v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x define <4 x bfloat> @vpmerge_vf_v4bf16(bfloat %a, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <4 x bfloat> %elt.head, <4 x bfloat> poison, <4 x i32> zeroinitializer @@ -80,11 +80,11 @@ define <8 x bfloat> @vpmerge_vv_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x define <8 x bfloat> @vpmerge_vf_v8bf16(bfloat %a, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v8bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma 
-; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <8 x bfloat> %elt.head, <8 x bfloat> poison, <8 x i32> zeroinitializer @@ -108,11 +108,11 @@ define <16 x bfloat> @vpmerge_vv_v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, < define <16 x bfloat> @vpmerge_vf_v16bf16(bfloat %a, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vf_v16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-NEXT: vfmv.v.f v12, fa5 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x bfloat> poison, bfloat %a, i32 0 %va = shufflevector <16 x bfloat> %elt.head, <16 x bfloat> poison, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index 9f0561b394b81..bdf76dc63ddd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -846,11 +846,11 @@ define <2 x half> @vpmerge_vf_v2f16(half %a, <2 x half> %vb, <2 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <2 x half> poison, half %a, i32 0 %va = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -880,11 +880,11 @@ define <4 x half> @vpmerge_vf_v4f16(half %a, <4 x half> %vb, <4 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <4 x half> poison, half %a, i32 0 %va = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -914,11 +914,11 @@ define <8 x half> @vpmerge_vf_v8f16(half %a, <8 x half> %vb, <8 x i1> %m, i32 ze ; ; ZVFHMIN-LABEL: vpmerge_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %a, i32 0 %va = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -948,11 +948,11 @@ define <16 x half> @vpmerge_vf_v16f16(half %a, <16 x half> %vb, <16 x i1> %m, i3 ; ; ZVFHMIN-LABEL: vpmerge_vf_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %a, i32 0 %va = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer From f1615e32379ff1ea125a8b3ac8792c3e0b5e6f2c Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 3 Sep 2024 17:52:08 -0700 Subject: [PATCH 008/425] [WebAssembly] Remove Kind argument from WebAssemblyOperand (NFC) (#107157) The `Kind` argument does not need to passed separately. --- .../AsmParser/WebAssemblyAsmParser.cpp | 56 ++++++++----------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index c63740d267819..24a9ad67cfe04 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -77,16 +77,16 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { struct BrLOp BrL; }; - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T) - : Kind(K), StartLoc(Start), EndLoc(End), Tok(T) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, IntOp I) - : Kind(K), StartLoc(Start), EndLoc(End), Int(I) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, FltOp F) - : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S) - : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {} - WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End) - : Kind(K), StartLoc(Start), EndLoc(End), BrL() {} + WebAssemblyOperand(SMLoc Start, SMLoc End, TokOp T) + : Kind(Token), StartLoc(Start), EndLoc(End), Tok(T) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, IntOp I) + : Kind(Integer), StartLoc(Start), EndLoc(End), Int(I) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, FltOp F) + : Kind(Float), StartLoc(Start), EndLoc(End), Flt(F) {} + WebAssemblyOperand(SMLoc Start, SMLoc End, SymOp S) + : Kind(Symbol), StartLoc(Start), EndLoc(End), Sym(S) {} + WebAssemblyOperand(SMLoc Start, SMLoc End) + : Kind(BrList), StartLoc(Start), EndLoc(End), BrL() {} ~WebAssemblyOperand() { if (isBrList()) @@ -388,8 +388,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, Int.getLoc(), Int.getEndLoc(), - WebAssemblyOperand::IntOp{Val})); + Int.getLoc(), Int.getEndLoc(), WebAssemblyOperand::IntOp{Val})); Parser.Lex(); } @@ -401,8 +400,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; 
Operands.push_back(std::make_unique( - WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), - WebAssemblyOperand::FltOp{Val})); + Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); return false; } @@ -423,8 +421,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (IsNegative) Val = -Val; Operands.push_back(std::make_unique( - WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), - WebAssemblyOperand::FltOp{Val})); + Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); return false; } @@ -459,8 +456,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // up later. auto Tok = Lexer.getTok(); Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(), - WebAssemblyOperand::IntOp{-1})); + Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::IntOp{-1})); } } return false; @@ -474,8 +470,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { NestingStack.back().Sig = Sig; } Operands.push_back(std::make_unique( - WebAssemblyOperand::Integer, NameLoc, NameLoc, - WebAssemblyOperand::IntOp{static_cast(BT)})); + NameLoc, NameLoc, WebAssemblyOperand::IntOp{static_cast(BT)})); } bool parseLimits(wasm::WasmLimits *Limits) { @@ -512,16 +507,14 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64); const auto *Val = MCSymbolRefExpr::create(Sym, getContext()); *Op = std::make_unique( - WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(), - WebAssemblyOperand::SymOp{Val}); + Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::SymOp{Val}); Parser.Lex(); return expect(AsmToken::Comma, ","); } else { const auto *Val = MCSymbolRefExpr::create(DefaultFunctionTable, getContext()); *Op = std::make_unique( - WebAssemblyOperand::Symbol, SMLoc(), SMLoc(), - WebAssemblyOperand::SymOp{Val}); + SMLoc(), SMLoc(), WebAssemblyOperand::SymOp{Val}); return false; } } else { @@ -529,8 +522,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // write a table symbol or issue relocations. Instead we just ensure the // table is live and write a zero. getStreamer().emitSymbolAttribute(DefaultFunctionTable, MCSA_NoDeadStrip); - *Op = std::make_unique(WebAssemblyOperand::Integer, - SMLoc(), SMLoc(), + *Op = std::make_unique(SMLoc(), SMLoc(), WebAssemblyOperand::IntOp{0}); return false; } @@ -564,7 +556,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { // Now construct the name as first operand. 
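The pattern behind this cleanup: each WebAssemblyOperand constructor overload already identifies which member of the union it initializes, so it can set Kind itself rather than having every call site spell WebAssemblyOperand::Integer, WebAssemblyOperand::Float, and so on. A minimal, self-contained C++ sketch of the idea, using simplified names and types rather than the actual parser classes:

#include <cstdint>
#include <memory>
#include <vector>

// Trimmed-down stand-in for WebAssemblyOperand; the real class also
// carries SMLoc source locations and Symbol/BrList alternatives.
struct Operand {
  enum KindTy { Integer, Float } Kind;

  struct IntOp { int64_t Val; };
  struct FltOp { double Val; };

  union {
    IntOp Int;
    FltOp Flt;
  };

  // Each overload knows which union member it fills in, so it sets
  // Kind itself; callers no longer pass the tag separately.
  Operand(IntOp I) : Kind(Integer), Int(I) {}
  Operand(FltOp F) : Kind(Float), Flt(F) {}
};

int main() {
  std::vector<std::unique_ptr<Operand>> Operands;
  // Call sites shrink to just the payload, mirroring the -/+ lines
  // in the surrounding hunks.
  Operands.push_back(std::make_unique<Operand>(Operand::IntOp{42}));
  Operands.push_back(std::make_unique<Operand>(Operand::FltOp{1.5}));
  return Operands.size() == 2 ? 0 : 1;
}

The Kind tag itself is kept (the parser still queries it, for example isBrList() in the destructor shown earlier); only the responsibility for setting it moves into the constructors.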
Operands.push_back(std::make_unique( - WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()), + NameLoc, SMLoc::getFromPointer(Name.end()), WebAssemblyOperand::TokOp{Name})); // If this instruction is part of a control flow structure, ensure @@ -645,8 +637,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { const MCExpr *Expr = MCSymbolRefExpr::create( WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); Operands.push_back(std::make_unique( - WebAssemblyOperand::Symbol, Loc.getLoc(), Loc.getEndLoc(), - WebAssemblyOperand::SymOp{Expr})); + Loc.getLoc(), Loc.getEndLoc(), WebAssemblyOperand::SymOp{Expr})); } while (Lexer.isNot(AsmToken::EndOfStatement)) { @@ -671,8 +662,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (Parser.parseExpression(Val, End)) return error("Cannot parse symbol: ", Lexer.getTok()); Operands.push_back(std::make_unique( - WebAssemblyOperand::Symbol, Start, End, - WebAssemblyOperand::SymOp{Val})); + Start, End, WebAssemblyOperand::SymOp{Val})); if (checkForP2AlignIfLoadStore(Operands, Name)) return true; } @@ -705,8 +695,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } case AsmToken::LCurly: { Parser.Lex(); - auto Op = std::make_unique( - WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc()); + auto Op = + std::make_unique(Tok.getLoc(), Tok.getEndLoc()); if (!Lexer.is(AsmToken::RCurly)) for (;;) { Op->BrL.List.push_back(Lexer.getTok().getIntVal()); From c8763f04bf2162d3f0f4f967dfeb2f0feda0c75b Mon Sep 17 00:00:00 2001 From: Yun-Fly Date: Wed, 4 Sep 2024 09:19:09 +0800 Subject: [PATCH 009/425] [mlir][tensor] Fix consumer fusion for `tensor.pack` without explicit `outer_dims_perm` attribute (#106687) --- mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp | 4 ++-- .../Interfaces/TilingInterface/tile-and-fuse-consumer.mlir | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index f35a9cd4cb927..9e17184ebed79 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -286,7 +286,7 @@ struct PackOpTiling SmallVector outerDimOffsets, outerDimSizes; DenseMap dimAndTileMapping = packOp.getDimAndTileMapping(); - for (auto dim : packOp.getOuterDimsPerm()) { + for (auto dim : llvm::seq(packOp.getSourceRank())) { if (dimAndTileMapping.count(dim)) { FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( @@ -327,7 +327,7 @@ struct PackOpTiling outerDimSizes.push_back(sizes[dim]); } } - + applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); resultOffsets = outerDimOffsets; resultSizes = outerDimSizes; return success(); diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 741dfbfb1cd5c..83c5ec8d7342c 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -337,7 +337,7 @@ module { } } %output = tensor.empty() : tensor<4x32x16xf32> - %pack = tensor.pack %1 outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + %pack = tensor.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> return %pack : tensor<4x32x16xf32> } } @@ -366,7 +366,7 @@ module attributes 
{transform.with_named_sequence} { // CHECK: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]]) // CHECK: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] // CHECK: %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]] -// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] +// CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] From 99f02a874984f2b79c3fbd8ae6bbceb7366521ad Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 4 Sep 2024 05:35:13 +0400 Subject: [PATCH 010/425] [clang] Add tests for CWG issues about language linkage (#107019) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch covers Core issues about language linkage during declaration matching resolved in [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html), namely [CWG563](https://cplusplus.github.io/CWG/issues/563.html) and [CWG1818](https://cplusplus.github.io/CWG/issues/1818.html). [CWG563](https://cplusplus.github.io/CWG/issues/563.html) "Linkage specification for objects" ----------- [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html): > [CWG563](https://cplusplus.github.io/CWG/issues/563.html) is resolved by simplifications that follow its suggestions. Wording ([[dcl.link]/5](https://eel.is/c++draft/dcl.link#5)): > In a [linkage-specification](https://eel.is/c++draft/dcl.link#nt:linkage-specification), the specified language linkage applies to the function types of all function declarators and to all functions and variables whose names have external linkage[.](https://eel.is/c++draft/dcl.link#5.sentence-5) Now the wording clearly says that linkage-specification applies to variables with external linkage. [CWG1818](https://cplusplus.github.io/CWG/issues/1818.html) "Visibility and inherited language linkage" ------------ [P1787R6](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1787r6.html): > [CWG386](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#386), [CWG1839](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1839), [CWG1818](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1818), [CWG2058](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#2058), [CWG1900](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1900), and Richard’s observation in [“are non-type names ignored in a class-head-name or enum-head-name?â€](http://lists.isocpp.org/core/2017/01/1604.php) are resolved by describing the limited lookup that occurs for a declarator-id, including the changes in Richard’s [proposed resolution for CWG1839](http://wiki.edg.com/pub/Wg21cologne2019/CoreWorkingGroup/cwg1839.html) (which also resolves CWG1818 and what of CWG2058 was not resolved along with CWG2059) and rejecting the example from [CWG1477](http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1477). Wording ([[dcl.link]/6](https://eel.is/c++draft/dcl.link#6)): > A redeclaration of an entity without a linkage specification inherits the language linkage of the entity and (if applicable) its type[.](https://eel.is/c++draft/dcl.link#6.sentence-2). Answer to the question in the example is `extern "C"`, and not linkage mismatch. 
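The two resolutions combine into one small, self-contained program; the following condenses what the new cwg1818.cpp and cwg563.cpp tests exercise and is illustrative only, not part of the patch:

extern "C" void f() {
  void g(); // block-scope declaration; its target scope is the enclosing
            // namespace, and it picks up f's C language linkage
}

void g() {} // redeclaration without a linkage-specification: per
            // _N4988_.[dcl.link]/6 it inherits C language linkage, so
            // this defines the same entity as the declaration above

extern "C" int a; // CWG563: the linkage-specification also applies to
                  // variables whose names have external linkage
int a = 0;        // same variable; the C language linkage is inherited

int main() {
  f();
  g();
  return a;
}

Both tests are therefore expected-no-diagnostics; the CHECK lines below only verify, via the AST dump, that the later declarations are matched against the ones inside the extern "C" specifications.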
Further analysis of the example is provided as inline comments in the test itself. Note that https://eel.is/c++draft/dcl.link#7 does NOT apply in this example, as it's focused squarely at declarations that are already known to have C language linkage, and declarations of variables in the global scope. --- clang/test/CXX/drs/cwg1818.cpp | 34 ++++++++++++++++++++++++++++++++++ clang/test/CXX/drs/cwg18xx.cpp | 2 ++ clang/test/CXX/drs/cwg563.cpp | 16 ++++++++++++++++ clang/test/CXX/drs/cwg5xx.cpp | 1 + clang/www/cxx_dr_status.html | 4 ++-- 5 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 clang/test/CXX/drs/cwg1818.cpp create mode 100644 clang/test/CXX/drs/cwg563.cpp diff --git a/clang/test/CXX/drs/cwg1818.cpp b/clang/test/CXX/drs/cwg1818.cpp new file mode 100644 index 0000000000000..bf2d12696a729 --- /dev/null +++ b/clang/test/CXX/drs/cwg1818.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s + +// expected-no-diagnostics + +namespace cwg1818 { // cwg1818: 3.4 +extern "C" void f() { + // This declaration binds name 'g' in the scope of function 'f', + // but its target scope corresponds to namespace 'cwg1818' (_N4988_.[dcl.meaning]/3.5). + // Linkage specification of 'f' applies to 'g' per _N4988_.[dcl.link]/5. + void g(); +} +// Target scope of this declaration is naturally the one +// that corresponds to namespace 'cwg1818', +// which makes it declare the same entity +// as the previous declaration per _N4988_.[basic.link]/8, +// turning it into a redeclaration per _N4988_.[basic.def]/1. +// Then _N4988_.[dcl.link]/6 applies, making it inherit +// the (С) language linkage of the previous declaration. +void g(); +} // namespace cwg1818 + +// Check that the former 'g' has C language linkage, +// then that the latter 'g' is considered to be a redeclaration of it, +// which would make the latter 'g' inherit C language linkage from the former 'g'. 
+ +// CHECK: LinkageSpecDecl [[LINKAGE_DECL:0x[0-9a-f]+]] {{.*}} C +// CHECK: FunctionDecl [[FIRST_DECL:0x[0-9a-f]+]] parent [[LINKAGE_DECL]] {{.*}} g 'void ()' +// CHECK: FunctionDecl {{.*}} prev [[FIRST_DECL]] {{.*}} g 'void ()' diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index adfdb738e81c9..61b7faa96a9fb 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -222,6 +222,8 @@ namespace cwg1815 { // cwg1815: no #endif } +// cwg1818 is in cwg1818.cpp + namespace cwg1820 { // cwg1820: 3.5 typedef int A; typedef int cwg1820::A; diff --git a/clang/test/CXX/drs/cwg563.cpp b/clang/test/CXX/drs/cwg563.cpp new file mode 100644 index 0000000000000..d585fefc44ffc --- /dev/null +++ b/clang/test/CXX/drs/cwg563.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s + +// expected-no-diagnostics + +namespace cwg563 { // cwg563: 3.3 +extern "C" int a; +} // namespace cwg563 + +// CHECK: LinkageSpecDecl {{.*}} C +// CHECK-NEXT: `-VarDecl {{.*}} a 'int' diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp index 6a0bb7a196669..ed0c7159dfc88 100644 --- a/clang/test/CXX/drs/cwg5xx.cpp +++ b/clang/test/CXX/drs/cwg5xx.cpp @@ -799,6 +799,7 @@ namespace cwg561 { // cwg561: yes } // cwg562: na +// cwg563 is in cwg563.cpp namespace cwg564 { // cwg564: yes extern "C++" void f(int); diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ca25776823cfa..aa79c3706f32b 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -3431,7 +3431,7 @@

C++ defect report implementation status

563 CD6 Linkage specification for objects - Unknown + Clang 3.3 564 @@ -10735,7 +10735,7 @@

C++ defect report implementation status

1818 CD6 Visibility and inherited language linkage - Unknown + Clang 3.4 1819 From b057e16740311b9c690c0c991c48b5087bf24d9a Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 3 Sep 2024 18:44:32 -0700 Subject: [PATCH 011/425] [IR] Remove unused MINARITY operand trait tpl args, NFC (#107165) These don't look like they've been used since the original 'use-diet' branch was merged in 2008 ( f6caff66a1bfa6464e6a17c0bcfcf06a09a9b909) --- llvm/include/llvm/Analysis/MemorySSA.h | 2 +- llvm/include/llvm/IR/Constants.h | 2 +- llvm/include/llvm/IR/Function.h | 3 +-- llvm/include/llvm/IR/InstrTypes.h | 4 ++-- llvm/include/llvm/IR/Instructions.h | 29 +++++++++----------------- llvm/include/llvm/IR/OperandTraits.h | 4 +--- llvm/include/llvm/IR/Operator.h | 4 ++-- llvm/include/llvm/IR/User.h | 1 - llvm/lib/IR/ConstantsContext.h | 2 +- 9 files changed, 19 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 43fea6ba27ec4..c5eff151ca418 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -689,7 +689,7 @@ inline void MemoryUseOrDef::resetOptimized() { cast(this)->resetOptimized(); } -template <> struct OperandTraits : public HungoffOperandTraits<2> {}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess) /// Encapsulates MemorySSA, including all data associated with memory diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 2788751e8b62a..62ccde96e5397 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -1381,7 +1381,7 @@ class ConstantExpr : public Constant { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 4abf978687d9d..ce48eed7883e6 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -1038,8 +1038,7 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject, /// Return value: true => null pointer dereference is not undefined. 
bool NullPointerIsDefined(const Function *F, unsigned AS = 0); -template <> -struct OperandTraits : public HungoffOperandTraits<3> {}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(Function, Value) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index afae564bf022d..6fddedd86e97b 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -2407,7 +2407,7 @@ class CallBase : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits {}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value) @@ -2474,7 +2474,7 @@ class FuncletPadInst : public Instruction { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index dbd7d49a3e767..6257d03458cab 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1112,9 +1112,8 @@ class GetElementPtrInst : public Instruction { }; template <> -struct OperandTraits : - public VariadicOperandTraits { -}; +struct OperandTraits + : public VariadicOperandTraits {}; GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, ArrayRef IdxList, unsigned Values, @@ -2723,9 +2722,7 @@ class PHINode : public Instruction { void growOperands(); }; -template <> -struct OperandTraits : public HungoffOperandTraits<2> { -}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value) @@ -2825,8 +2822,7 @@ class LandingPadInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<1> { -}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value) @@ -2903,8 +2899,7 @@ class ReturnInst : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits { -}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReturnInst, Value) @@ -3039,8 +3034,7 @@ class BranchInst : public Instruction { }; template <> -struct OperandTraits : public VariadicOperandTraits { -}; +struct OperandTraits : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) @@ -3428,9 +3422,7 @@ class SwitchInstProfUpdateWrapper { static CaseWeightOpt getSuccessorWeight(const SwitchInst &SI, unsigned idx); }; -template <> -struct OperandTraits : public HungoffOperandTraits<2> { -}; +template <> struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value) @@ -3554,8 +3546,7 @@ class IndirectBrInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<1> { -}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(IndirectBrInst, Value) @@ -4105,7 +4096,7 @@ class CatchSwitchInst : public Instruction { }; template <> -struct OperandTraits : public HungoffOperandTraits<2> {}; +struct OperandTraits : public HungoffOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value) @@ -4337,7 +4328,7 @@ class CleanupReturnInst : public Instruction { template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; 
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value) diff --git a/llvm/include/llvm/IR/OperandTraits.h b/llvm/include/llvm/IR/OperandTraits.h index ffece6324aab0..efc3d7838feaf 100644 --- a/llvm/include/llvm/IR/OperandTraits.h +++ b/llvm/include/llvm/IR/OperandTraits.h @@ -64,8 +64,7 @@ struct OptionalOperandTraits : public FixedNumOperandTraits { /// when it is a prefix to the User object, and the number of Use objects is /// only known at allocation time. -template -struct VariadicOperandTraits { +template struct VariadicOperandTraits { static Use *op_begin(SubClass* U) { static_assert( !std::is_polymorphic::value, @@ -91,7 +90,6 @@ struct VariadicOperandTraits { /// This is the traits class that is needed when the Use array must be /// resizable. -template struct HungoffOperandTraits { static Use *op_begin(User* U) { return U->getHungOffOperands(); diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index f63f54ef94107..88b9bfc0be4b1 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -533,8 +533,8 @@ class GEPOperator }; template <> -struct OperandTraits - : public VariadicOperandTraits {}; +struct OperandTraits : public VariadicOperandTraits { +}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GEPOperator, Value) diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index a9cf60151e5dc..910815f236abe 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -42,7 +42,6 @@ template struct OperandTraits; class User : public Value { - template friend struct HungoffOperandTraits; LLVM_ATTRIBUTE_ALWAYS_INLINE static void * diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index acf05379e8a2a..bd19ec6b9dcac 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -246,7 +246,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value) template <> struct OperandTraits - : public VariadicOperandTraits {}; + : public VariadicOperandTraits {}; DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrConstantExpr, Value) From ed220e15718498d0f854f1044ddcbfee00739aa7 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:46:02 +0800 Subject: [PATCH 012/425] [VPlan][NFC] Implement `VPWidenMemoryRecipe::computeCost()`. (#105614) In this patch, we implement the `computeCost()` function in `VPWidenMemoryRecipe`. --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bd71dbffa929e..9ad98a5371d81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2539,6 +2539,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase { llvm_unreachable("VPWidenMemoryRecipe should not be instantiated."); } + /// Return the cost of this VPWidenMemoryRecipe. 
+ InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + Instruction &getIngredient() const { return Ingredient; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 69c76edd0f554..49ed733107da9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -272,6 +272,12 @@ static Instruction *getInstructionForCost(const VPRecipeBase *R) { return dyn_cast_or_null(S->getUnderlyingValue()); if (auto *IG = dyn_cast(R)) return IG->getInsertPos(); + // Currently the legacy cost model only calculates the instruction cost with + // underlying instruction. Removing the WidenMem here will prevent + // force-target-instruction-cost overwriting the cost of recipe with + // underlying instruction which is inconsistent with the legacy model. + // TODO: Remove WidenMem from this function when we don't need to compare to + // the legacy model. if (auto *WidenMem = dyn_cast(R)) return &WidenMem->getIngredient(); return nullptr; @@ -2132,6 +2138,46 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + const Align Alignment = + getLoadStoreAlignment(const_cast(&Ingredient)); + unsigned AS = + getLoadStoreAddressSpace(const_cast(&Ingredient)); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + if (!Consecutive) { + // TODO: Using the original IR may not be accurate. + // Currently, ARM will use the underlying IR to calculate gather/scatter + // instruction cost. + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + assert(!Reverse && + "Inconsecutive memory access should not have the order."); + return Ctx.TTI.getAddressComputationCost(Ty) + + Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, + IsMasked, Alignment, CostKind, + &Ingredient); + } + + InstructionCost Cost = 0; + if (IsMasked) { + Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, + AS, CostKind); + } else { + TTI::OperandValueInfo OpInfo = + Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); + Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, + CostKind, OpInfo, &Ingredient); + } + if (!Reverse) + return Cost; + + return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, + cast(Ty), std::nullopt, + CostKind, 0); +} + void VPWidenLoadRecipe::execute(VPTransformState &State) { auto *LI = cast(&Ingredient); From 9b5971ad0355d43a9bd37b1067d93ff8b08eba81 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Tue, 3 Sep 2024 18:55:23 -0700 Subject: [PATCH 013/425] [AArch64][GlobalISel] Lower G_BUILD_VECTOR to G_INSERT_VECTOR_ELT (#105686) The lowering happens in post-legalizer lowering if any source registers from G_BUILD_VECTOR are not constants. 
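As a rough sketch (simplified from the updated post-legalizer lowering MIR
tests; register names here are illustrative), a build vector with
non-constant sources such as

    %v:_(<2 x s32>) = G_BUILD_VECTOR %a(s32), %b(s32)

is rewritten into a chain of inserts into an undef vector:

    %undef:_(<2 x s32>) = G_IMPLICIT_DEF
    %zero:_(s64) = G_CONSTANT i64 0
    %ins0:_(<2 x s32>) = G_INSERT_VECTOR_ELT %undef, %a(s32), %zero(s64)
    %one:_(s64) = G_CONSTANT i64 1
    %v:_(<2 x s32>) = G_INSERT_VECTOR_ELT %ins0, %b(s32), %one(s64)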
Add a pattern fragment setting `scalar_to_vector ($src)` as equivalent to
`vector_insert (undef), ($src), (i64 0)`
---
 llvm/lib/Target/AArch64/AArch64Combine.td | 10 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 73 +-
 .../GISel/AArch64InstructionSelector.cpp | 15 +
 .../GISel/AArch64PostLegalizerLowering.cpp | 34 +
 .../legalize-shuffle-vector-widen-crash.ll | 12 +-
 ...legalizer-lowering-build-vector-to-dup.mir | 24 +-
 .../postlegalizer-lowering-shuffle-splat.mir | 51 +-
 llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 2 +-
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 2 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll | 116 ++--
 llvm/test/CodeGen/AArch64/abs.ll | 8 +-
 llvm/test/CodeGen/AArch64/add.ll | 100 ++-
 llvm/test/CodeGen/AArch64/andorxor.ll | 300 ++++-----
 llvm/test/CodeGen/AArch64/arm64-dup.ll | 61 +-
 .../AArch64/arm64-extract-insert-varidx.ll | 45 +-
 .../AArch64/arm64-indexed-vector-ldst.ll | 82 +--
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 318 ++++-----
 .../CodeGen/AArch64/arm64-subvector-extend.ll | 372 ++++-------
 llvm/test/CodeGen/AArch64/arm64-tbl.ll | 159 +++--
 llvm/test/CodeGen/AArch64/bitcast.ll | 106 +--
 llvm/test/CodeGen/AArch64/bswap.ll | 6 +-
 llvm/test/CodeGen/AArch64/concat-vector.ll | 178 +++--
 llvm/test/CodeGen/AArch64/fabs.ll | 32 +-
 llvm/test/CodeGen/AArch64/faddsub.ll | 80 +--
 llvm/test/CodeGen/AArch64/fcmp.ll | 334 +++++----
 llvm/test/CodeGen/AArch64/fcopysign.ll | 34 +-
 llvm/test/CodeGen/AArch64/fcvt.ll | 224 +++----
 llvm/test/CodeGen/AArch64/fdiv.ll | 40 +-
 llvm/test/CodeGen/AArch64/fexplog.ll | 315 ++++-----
 .../AArch64/fixed-vector-interleave.ll | 14 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll | 80 +--
 llvm/test/CodeGen/AArch64/fminmax.ll | 80 +--
 llvm/test/CodeGen/AArch64/fmla.ll | 168 ++---
 llvm/test/CodeGen/AArch64/fmul.ll | 40 +-
 llvm/test/CodeGen/AArch64/fneg.ll | 32 +-
 llvm/test/CodeGen/AArch64/fpow.ll | 12 +-
 llvm/test/CodeGen/AArch64/fpowi.ll | 12 +-
 llvm/test/CodeGen/AArch64/fptoi.ll | 70 +-
 llvm/test/CodeGen/AArch64/fptrunc.ll | 12 +-
 llvm/test/CodeGen/AArch64/frem.ll | 12 +-
 llvm/test/CodeGen/AArch64/fsincos.ll | 126 ++--
 llvm/test/CodeGen/AArch64/fsqrt.ll | 32 +-
 llvm/test/CodeGen/AArch64/icmp.ll | 16 +-
 llvm/test/CodeGen/AArch64/insertextract.ll | 47 +-
 llvm/test/CodeGen/AArch64/itofp.ll | 180 +++--
 llvm/test/CodeGen/AArch64/llvm.exp10.ll | 10 +-
 llvm/test/CodeGen/AArch64/load.ll | 48 +-
 llvm/test/CodeGen/AArch64/mul.ll | 134 ++--
 .../AArch64/neon-bitwise-instructions.ll | 34 +-
 .../AArch64/neon-compare-instructions.ll | 12 +-
 llvm/test/CodeGen/AArch64/neon-extadd.ll | 358 +++++-----
 llvm/test/CodeGen/AArch64/neon-extmul.ll | 28 +-
 llvm/test/CodeGen/AArch64/neon-perm.ll | 13 +-
 llvm/test/CodeGen/AArch64/ptradd.ll | 52 +-
 llvm/test/CodeGen/AArch64/rem.ll | 632 +++++++++---------
 llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/sext.ll | 206 +++---
 llvm/test/CodeGen/AArch64/shift.ll | 177 ++---
 llvm/test/CodeGen/AArch64/shufflevector.ll | 70 +-
 llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/sub.ll | 100 ++-
 llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 38 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 528 ++++++---------
 llvm/test/CodeGen/AArch64/xtn.ll | 77 ++-
 llvm/test/CodeGen/AArch64/zext.ll | 150 ++---
 66 files changed, 3200 insertions(+), 3647 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index d12f834da5a15..f99d1e276c60f 100644
---
a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -222,7 +222,15 @@ def build_vector_to_dup : GICombineRule< (apply [{ applyBuildVectorToDup(*${root}, MRI, B); }]) >; -def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>; +def build_vector_to_vector_insert : GICombineRule< + (defs root:$root, register_matchinfo:$matchinfo), + (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, + [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]), + (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }]) +>; + +def build_vector_lowering : GICombineGroup<[build_vector_to_dup, + build_vector_to_vector_insert]>; def lower_vector_fcmp : GICombineRule< (defs root:$root), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2fff6fffcd7c6..c659697c3a1be 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3315,6 +3315,10 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>; // Pre-fetch. defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; +def vec_ins_or_scal_vec : PatFrags<(ops node:$src), + [(vector_insert undef, node:$src, (i64 0)), + (scalar_to_vector node:$src)]>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. @@ -3323,13 +3327,13 @@ multiclass ScalToVecROLoadPat { - def : Pat<(VecTy (scalar_to_vector (ScalTy + def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), sub)>; - def : Pat<(VecTy (scalar_to_vector (ScalTy + def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), @@ -3357,12 +3361,12 @@ defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend))))), (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend))))), (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; @@ -3495,34 +3499,34 @@ def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), // Thus, it is safe to directly map the vector loads with interesting // addressing modes. // FIXME: We could do the same for bitconvert to floating point vectors. 
-def : Pat <(v8i8 (scalar_to_vector (i32 +def : Pat <(v8i8 (vec_ins_or_scal_vec (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 +def : Pat <(v16i8 (vec_ins_or_scal_vec (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 +def : Pat <(v4i16 (vec_ins_or_scal_vec (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 +def : Pat <(v8i16 (vec_ins_or_scal_vec (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 +def : Pat <(v2i32 (vec_ins_or_scal_vec (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 +def : Pat <(v4i32 (vec_ins_or_scal_vec (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 +def : Pat <(v1i64 (vec_ins_or_scal_vec (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat <(v2i64 (scalar_to_vector (i64 +def : Pat <(v2i64 (vec_ins_or_scal_vec (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; @@ -6848,10 +6852,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), defm INS : SIMDIns; -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; @@ -6859,50 +6863,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))), (SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>; -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v2i32 (scalar_to_vector (i32 
FPR32:$Rn))), +def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), +def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; - -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), +def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), +def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), @@ -8507,7 +8510,7 @@ def : Ld1Lane64IdxOpPat - : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), + : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))), (ResultTy (EXTRACT_SUBREG (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>; @@ -8940,11 +8943,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; +def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index e9e6b6cb68d0d..18361cf368564 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2116,6 +2116,21 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { I.getOperand(1).setReg(NewSrc.getReg(0)); return true; } + case AArch64::G_INSERT_VECTOR_ELT: { + // Convert the type from p0 to s64 to help selection. 
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg()); + if (!SrcVecTy.isPointerVector()) + return false; + auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg()); + MRI.setType(I.getOperand(1).getReg(), + DstTy.changeElementType(LLT::scalar(64))); + MRI.setType(I.getOperand(0).getReg(), + DstTy.changeElementType(LLT::scalar(64))); + MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); + I.getOperand(2).setReg(NewSrc.getReg(0)); + return true; + } case TargetOpcode::G_UITOFP: case TargetOpcode::G_SITOFP: { // If both source and destination regbanks are FPR, then convert the opcode diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 90ac4bdff4e0e..b40fe55fdfaf6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -1054,6 +1054,40 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +// Matches G_BUILD_VECTOR where at least one source operand is not a constant +bool matchLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI) { + auto *GBuildVec = cast(&MI); + + // Check if the values are all constants + for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) { + auto ConstVal = + getAnyConstantVRegValWithLookThrough(GBuildVec->getSourceReg(I), MRI); + + if (!ConstVal.has_value()) + return true; + } + + return false; +} + +void applyLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) { + auto *GBuildVec = cast(&MI); + LLT DstTy = MRI.getType(GBuildVec->getReg(0)); + Register DstReg = B.buildUndef(DstTy).getReg(0); + + for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) { + Register SrcReg = GBuildVec->getSourceReg(I); + if (mi_match(SrcReg, MRI, m_GImplicitDef())) + continue; + auto IdxReg = B.buildConstant(LLT::scalar(64), I); + DstReg = + B.buildInsertVectorElement(DstTy, DstReg, SrcReg, IdxReg).getReg(0); + } + B.buildCopy(GBuildVec->getReg(0), DstReg); + GBuildVec->eraseFromParent(); +} + bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI, Register &SrcReg) { assert(MI.getOpcode() == TargetOpcode::G_STORE); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll index f7efaeaa50705..87c1307ad2955 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll @@ -10,12 +10,14 @@ define i32 @bar() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: mov b1, v0[1] -; CHECK-NEXT: mov b2, v0[2] -; CHECK-NEXT: mov b3, v0[3] -; CHECK-NEXT: mov.h v0[1], v1[0] -; CHECK-NEXT: mov.h v2[1], v3[0] +; CHECK-NEXT: mov b2, v0[3] +; CHECK-NEXT: mov b3, v0[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov.h v0[1], w8 +; CHECK-NEXT: mov.h v3[1], w9 ; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll.4s v1, v2, #0 +; CHECK-NEXT: ushll.4s v1, v3, #0 ; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: movi.4s v1, #1 ; CHECK-NEXT: and.16b v0, v0, v1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir index 
70867c2ea2842..0115531dfb09a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir @@ -42,20 +42,30 @@ body: | ; LOWER-NEXT: {{ $}} ; LOWER-NEXT: %r:_(s32) = COPY $w0 ; LOWER-NEXT: %q:_(s32) = COPY $w1 - ; LOWER-NEXT: %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r(s32), %q(s32) + ; LOWER-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; LOWER-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; LOWER-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %r(s32), [[C]](s64) + ; LOWER-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; LOWER-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %q(s32), [[C1]](s64) + ; LOWER-NEXT: %build_vector:_(<2 x s32>) = COPY [[IVEC1]](<2 x s32>) ; LOWER-NEXT: $d0 = COPY %build_vector(<2 x s32>) ; LOWER-NEXT: RET_ReallyLR implicit $d0 ; ; SELECT-LABEL: name: dont_combine_different_reg ; SELECT: liveins: $d0, $w0, $w1 ; SELECT-NEXT: {{ $}} - ; SELECT-NEXT: %r:gpr32all = COPY $w0 + ; SELECT-NEXT: %r:gpr32 = COPY $w0 ; SELECT-NEXT: %q:gpr32 = COPY $w1 - ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF - ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub - ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %q - ; SELECT-NEXT: %build_vector:fpr64 = COPY [[INSvi32gpr]].dsub - ; SELECT-NEXT: $d0 = COPY %build_vector + ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF + ; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub + ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r + ; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub + ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub + ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q + ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub + ; SELECT-NEXT: $d0 = COPY [[COPY1]] ; SELECT-NEXT: RET_ReallyLR implicit $d0 %r:_(s32) = COPY $w0 %q:_(s32) = COPY $w1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir index 71094825e42f3..7c7689bcb80b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir @@ -355,7 +355,21 @@ body: | ; CHECK: liveins: $w0, $w1, $w2, $w3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lane:_(s32) = COPY $w0 - ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUP %lane(s32) + ; CHECK-NEXT: %b:_(s32) = COPY $w1 + ; CHECK-NEXT: %c:_(s32) = COPY $w2 + ; CHECK-NEXT: %d:_(s32) = COPY $w3 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane(s32), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 
+ ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64) + ; CHECK-NEXT: %buildvec:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUPLANE32 %buildvec, [[C4]](s64) ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lane:_(s32) = COPY $w0 @@ -367,7 +381,7 @@ body: | %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec(<4 x s32>), %undef, shufflemask(0, 0, 0, 0) $q0 = COPY %shuf(<4 x s32>) RET_ReallyLR implicit $q0 - + ... --- name: build_vector_rhs @@ -382,10 +396,35 @@ body: | ; ; CHECK-LABEL: name: build_vector ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4 - ; CHECK: %lane_1:_(s32) = COPY $w1 - ; CHECK: %shuf:_(<4 x s32>) = G_DUP %lane_1(s32) - ; CHECK: $q0 = COPY %shuf(<4 x s32>) - ; CHECK: RET_ReallyLR implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %lane_0:_(s32) = COPY $w0 + ; CHECK-NEXT: %lane_1:_(s32) = COPY $w1 + ; CHECK-NEXT: %b:_(s32) = COPY $w2 + ; CHECK-NEXT: %c:_(s32) = COPY $w3 + ; CHECK-NEXT: %d:_(s32) = COPY $w4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane_0(s32), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64) + ; CHECK-NEXT: %buildvec0:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC4:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], %lane_1(s32), [[C4]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC5:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC4]], %b(s32), [[C5]](s64) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC6:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC5]], %c(s32), [[C6]](s64) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[IVEC7:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC6]], %d(s32), [[C7]](s64) + ; CHECK-NEXT: %buildvec1:_(<4 x s32>) = COPY [[IVEC7]](<4 x s32>) + ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec0(<4 x s32>), %buildvec1, shufflemask(4, 4, 4, 4) + ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lane_0:_(s32) = COPY $w0 %lane_1:_(s32) = COPY $w1 %b:_(s32) = COPY $w2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll index f47da47002fbc..9734ab35bd6b2 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -76,7 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; CHECK-GI-NEXT: bic w9, w9, w8 ; CHECK-GI-NEXT: and w8, w8, w10 ; CHECK-GI-NEXT: orr w8, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %neg = xor <1 x i32> %C, diff --git 
a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll index 6431cfc58a54d..45ad4b07ff66f 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -76,7 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; CHECK-GI-NEXT: and w9, w8, w9 ; CHECK-GI-NEXT: bic w8, w10, w8 ; CHECK-GI-NEXT: orr w8, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %and = and <1 x i32> %C, %B diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 307aa397eabbb..d677526bab000 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -209,24 +209,22 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr w8, [x0] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: uxtb w8, w8 -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[3] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: fmov w10, s2 ; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: ldr d2, [x1] ; CHECK-GI-NEXT: uxtb w9, w9 ; CHECK-GI-NEXT: uxtb w10, w10 ; CHECK-GI-NEXT: uxtb w11, w11 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s @@ -269,25 +267,25 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x0] ; CHECK-GI-NEXT: ldr h2, [x0, #2] ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: ldr d0, [x1] -; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: mov x9, v0.d[1] -; CHECK-GI-NEXT: fmov x10, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i16>, ptr %A %load.B = load <2 x i32>, ptr %B @@ -322,14 +320,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: fmov x10, d0 -; 
CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %load.A = load <2 x i32>, ptr %A %and.A = and <2 x i32> %load.A, @@ -1048,14 +1046,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI36_0 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %tmp3 = sext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1163,14 +1161,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI40_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> %tmp4 = mul <2 x i64> %tmp3, @@ -1264,15 +1262,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ; CHECK-GI-NEXT: adrp x8, .LCPI43_0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ret %tmp3 = zext <2 x i32> %arg to <2 x i64> @@ -1891,15 +1889,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; 
CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret entry: %in1 = zext <2 x i32> %src1 to <2 x i64> @@ -1947,10 +1945,10 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { ; CHECK-GI-NEXT: fmov x9, d0 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x9, x9, x12 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret entry: @@ -1992,9 +1990,9 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mul x9, x12, x9 ; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x11 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 78c1ff7b99370..6da019a79b727 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -247,7 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){ ; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: cmp w8, #0 ; CHECK-GI-NEXT: cneg w8, w9, le -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -299,10 +299,8 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){ ; CHECK-GI-LABEL: abs_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 ; CHECK-GI-NEXT: abs v0.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] ; CHECK-GI-NEXT: umov w1, v0.b[1] diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index ee15445a7bbd6..fc1a0c71d4cdf 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: add v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: 
fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; 
CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 1176c98ce44e3..5385a917619fa 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -191,13 +191,13 @@ define void @and_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: and_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -228,13 +228,13 @@ define void @or_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: or_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -265,13 +265,13 @@ define void @xor_v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: xor_v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -306,22 +306,18 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; 
CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -358,22 +354,18 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -410,22 +402,18 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -459,27 +447,27 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; 
CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -510,27 +498,27 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -561,27 +549,27 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; 
CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -723,13 +711,13 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -762,13 +750,13 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -801,13 +789,13 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -836,18 +824,16 @@ define void @and_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: and_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; 
CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -875,18 +861,16 @@ define void @or_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: or_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -914,18 +898,16 @@ define void @xor_v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: xor_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll index 0291f8c912304..a25763e3b1590 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -334,25 +334,40 @@ entry: } define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { -; CHECK-LABEL: f: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov.s v0[1], w1 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: f: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: f: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 ret <2 x i32> %vecinit1 } define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { -; CHECK-LABEL: g: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov.s v0[1], w1 -; CHECK-NEXT: mov.s v0[2], w1 -; CHECK-NEXT: mov.s v0[3], w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: g: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v0[2], w1 +; CHECK-SD-NEXT: mov.s v0[3], w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: g: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v0[2], w1 +; CHECK-GI-NEXT: mov.s v0[3], w0 +; CHECK-GI-NEXT: ret %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 @@ -361,11 +376,17 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { } define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { -; CHECK-LABEL: h: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: mov.d v0[1], x1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: mov.d v0[1], x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov.d v0[0], x0 +; CHECK-GI-NEXT: mov.d v0[1], x1 +; CHECK-GI-NEXT: ret %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 ret <2 x i64> %vecinit1 @@ -386,8 +407,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) { ; ; CHECK-GI-LABEL: test_build_illegal: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov s0, v0[3] -; CHECK-GI-NEXT: mov.h v0[3], v0[0] +; CHECK-GI-NEXT: mov.s w8, v0[3] +; CHECK-GI-NEXT: mov.h v0[3], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %val = extractelement <4 x i32> %in, i32 3 diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll index bc399c8d4ff07..8611532d6ea92 100644 --- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll +++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll @@ -29,19 +29,20 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) { ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GISEL-NEXT: mov b1, v0.b[1] ; CHECK-GISEL-NEXT: add x8, sp, #8 -; CHECK-GISEL-NEXT: and x9, x9, #0x7 ; CHECK-GISEL-NEXT: str d0, [sp, #8] +; CHECK-GISEL-NEXT: and x9, x9, #0x7 +; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov b3, v0.b[2] ; CHECK-GISEL-NEXT: lsl x10, x9, #1 ; CHECK-GISEL-NEXT: mov b0, v0.b[3] ; CHECK-GISEL-NEXT: sub x9, x10, x9 -; CHECK-GISEL-NEXT: ldr b2, [x8, x9] -; CHECK-GISEL-NEXT: mov v2.b[1], v1.b[0] -; CHECK-GISEL-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GISEL-NEXT: mov v2.b[3], v0.b[0] -; CHECK-GISEL-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GISEL-NEXT: ldr b1, [x8, x9] +; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0] +; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0] +; CHECK-GISEL-NEXT: mov v1.b[3], v0.b[0] +; CHECK-GISEL-NEXT: ushll v0.8h, v1.8b, #0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -82,14 +83,15 @@ define <8 x i8> @test_varidx_extract_v16s8(<16 x i8> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and x9, x9, #0xf ; CHECK-GISEL-NEXT: str q0, [sp] +; CHECK-GISEL-NEXT: and x9, x9, #0xf +; CHECK-GISEL-NEXT: mov b2, v0.b[1] ; CHECK-GISEL-NEXT: mov b3, v0.b[2] ; CHECK-GISEL-NEXT: lsl x10, x9, #1 ; CHECK-GISEL-NEXT: sub 
x9, x10, x9 ; CHECK-GISEL-NEXT: ldr b1, [x8, x9] +; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0] ; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0] ; CHECK-GISEL-NEXT: mov b2, v0.b[3] ; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0] @@ -176,15 +178,14 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) { ; CHECK-GISEL: // %bb.0: ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov h1, v0.h[1] +; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GISEL-NEXT: add x8, sp, #8 ; CHECK-GISEL-NEXT: str d0, [sp, #8] ; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: ldr h0, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GISEL-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1] +; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GISEL-NEXT: ushll v0.4s, v1.4h, #0 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -217,16 +218,13 @@ define <4 x i16> @test_varidx_extract_v8s16(<8 x i16> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov h2, v0.h[1] ; CHECK-GISEL-NEXT: mov x8, sp ; CHECK-GISEL-NEXT: str q0, [sp] ; CHECK-GISEL-NEXT: and x9, x9, #0x7 -; CHECK-GISEL-NEXT: mov h3, v0.h[2] ; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: mov h0, v0.h[3] -; CHECK-GISEL-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GISEL-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GISEL-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GISEL-NEXT: mov v1.h[3], v0.h[3] ; CHECK-GISEL-NEXT: fmov d0, d1 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret @@ -289,13 +287,12 @@ define <2 x i32> @test_varidx_extract_v4s32(<4 x i32> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 -; CHECK-GISEL-NEXT: mov s1, v0.s[1] ; CHECK-GISEL-NEXT: mov x8, sp ; CHECK-GISEL-NEXT: str q0, [sp] ; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: ldr s0, [x8, x9, lsl #2] -; CHECK-GISEL-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: ldr s1, [x8, x9, lsl #2] +; CHECK-GISEL-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GISEL-NEXT: fmov d0, d1 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret %tmp = extractelement <4 x i32> %x, i32 %idx diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 720951eca6a34..0412aef7545e9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -13820,12 +13820,10 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; CHECK-GI-LABEL: test_ld1lane_build: ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: ldr s1, [x1] -; CHECK-GI-NEXT: ldr s2, [x2] -; CHECK-GI-NEXT: ldr s3, [x3] -; CHECK-GI-NEXT: mov.s v0[1], v1[0] -; CHECK-GI-NEXT: mov.s v2[1], v3[0] -; CHECK-GI-NEXT: sub.2s v0, v0, v2 +; CHECK-GI-NEXT: ldr s1, [x2] +; CHECK-GI-NEXT: ld1.s { v0 }[1], [x1] +; CHECK-GI-NEXT: ld1.s { v1 }[1], [x3] +; CHECK-GI-NEXT: sub.2s v0, v0, v1 ; CHECK-GI-NEXT: str d0, [x4] ; CHECK-GI-NEXT: ret %load0 = load i32, ptr %ptr0, align 4 @@ -13844,28 +13842,15 @@ define 
void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr } define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %e, ptr %p) { -; CHECK-SD-LABEL: test_ld1lane_build_i16: -; CHECK-SD: ; %bb.0: -; CHECK-SD-NEXT: ldr h1, [x0] -; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1] -; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2] -; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3] -; CHECK-SD-NEXT: sub.4h v0, v1, v0 -; CHECK-SD-NEXT: str d0, [x4] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_ld1lane_build_i16: -; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: mov.h v1[1], v2[0] -; CHECK-GI-NEXT: ldr h2, [x2] -; CHECK-GI-NEXT: mov.h v1[2], v2[0] -; CHECK-GI-NEXT: ldr h2, [x3] -; CHECK-GI-NEXT: mov.h v1[3], v2[0] -; CHECK-GI-NEXT: sub.4h v0, v1, v0 -; CHECK-GI-NEXT: str d0, [x4] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_ld1lane_build_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ld1.h { v1 }[1], [x1] +; CHECK-NEXT: ld1.h { v1 }[2], [x2] +; CHECK-NEXT: ld1.h { v1 }[3], [x3] +; CHECK-NEXT: sub.4h v0, v1, v0 +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret %ld.a = load i16, ptr %a %ld.b = load i16, ptr %b %ld.c = load i16, ptr %c @@ -13880,34 +13865,18 @@ define void @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> % } define void @test_ld1lane_build_half(ptr %a, ptr %b, ptr %c, ptr %d, <4 x half> %e, ptr %p) { -; CHECK-SD-LABEL: test_ld1lane_build_half: -; CHECK-SD: ; %bb.0: -; CHECK-SD-NEXT: ldr h1, [x0] -; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h -; CHECK-SD-NEXT: ld1.h { v1 }[1], [x1] -; CHECK-SD-NEXT: ld1.h { v1 }[2], [x2] -; CHECK-SD-NEXT: ld1.h { v1 }[3], [x3] -; CHECK-SD-NEXT: fcvtl v1.4s, v1.4h -; CHECK-SD-NEXT: fsub.4s v0, v1, v0 -; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s -; CHECK-SD-NEXT: str d0, [x4] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_ld1lane_build_half: -; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h -; CHECK-GI-NEXT: mov.h v1[1], v2[0] -; CHECK-GI-NEXT: ldr h2, [x2] -; CHECK-GI-NEXT: mov.h v1[2], v2[0] -; CHECK-GI-NEXT: ldr h2, [x3] -; CHECK-GI-NEXT: mov.h v1[3], v2[0] -; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NEXT: fsub.4s v0, v1, v0 -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NEXT: str d0, [x4] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_ld1lane_build_half: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NEXT: ld1.h { v1 }[1], [x1] +; CHECK-NEXT: ld1.h { v1 }[2], [x2] +; CHECK-NEXT: ld1.h { v1 }[3], [x3] +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fsub.4s v0, v1, v0 +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret %ld.a = load half, ptr %a %ld.b = load half, ptr %b %ld.c = load half, ptr %c @@ -13942,6 +13911,7 @@ define void @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr ; CHECK-GI-NEXT: ldr b1, [x0] ; CHECK-GI-NEXT: ldr b2, [x1] ; CHECK-GI-NEXT: ldr x8, [sp] +; CHECK-GI-NEXT: mov.b v1[0], v1[0] ; CHECK-GI-NEXT: mov.b v1[1], v2[0] ; CHECK-GI-NEXT: ldr b2, [x2] ; CHECK-GI-NEXT: mov.b v1[2], v2[0] diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index c56f4409e3a62..c0d91c1e0c836 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1259,7 +1259,7 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { ; ; CHECK-GI-LABEL: scalar_to_vector.v2i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s0, w0 +; 
CHECK-GI-NEXT: mov v0.s[0], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %b = insertelement <2 x i32> undef, i32 %a, i32 0 @@ -1267,19 +1267,29 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { } define <4 x i32> @scalar_to_vector.v4i32(i32 %a) { -; CHECK-LABEL: scalar_to_vector.v4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_to_vector.v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_to_vector.v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: ret %b = insertelement <4 x i32> undef, i32 %a, i32 0 ret <4 x i32> %b } define <2 x i64> @scalar_to_vector.v2i64(i64 %a) { -; CHECK-LABEL: scalar_to_vector.v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_to_vector.v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_to_vector.v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: ret %b = insertelement <2 x i64> undef, i64 %a, i32 0 ret <2 x i64> %b } @@ -1348,21 +1358,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; ; CHECK-GI-LABEL: getl: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: mov b5, v0.b[5] -; CHECK-GI-NEXT: mov b6, v0.b[6] -; CHECK-GI-NEXT: mov b7, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v7.b[0] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov v1.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[3] +; CHECK-GI-NEXT: mov v1.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[4] +; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[5] +; CHECK-GI-NEXT: mov v1.b[4], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v3.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: ret %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 @@ -1405,16 +1416,13 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: mov w9, w0 -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov x8, sp ; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x7 -; CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: ldr h1, [x8, x9, lsl #1] -; CHECK-GI-NEXT: mov h0, v0.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] ; CHECK-GI-NEXT: fmov d0, d1 ; CHECK-GI-NEXT: add sp, sp, #16 ; CHECK-GI-NEXT: ret @@ -1709,8 +1717,8 @@ define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) { ; ; CHECK-GI-LABEL: test_concat_undef_v1i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; 
CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v0.s[1], v0.s[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -1794,25 +1802,26 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov b3, v0.b[1] ; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v1.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v1.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v1.b[5], v3.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI127_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1844,36 +1853,38 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b3, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[1] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v2.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v2.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v2.b[5], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[2] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[1] +; CHECK-GI-NEXT: mov 
v2.b[8], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[9], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[3] +; CHECK-GI-NEXT: mov v2.b[10], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[4] -; CHECK-GI-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[5] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[11], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[5] +; CHECK-GI-NEXT: mov v2.b[12], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[6] -; CHECK-GI-NEXT: mov b1, v1.b[7] -; CHECK-GI-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[13], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[7] +; CHECK-GI-NEXT: mov v2.b[14], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[15], v0.b[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <16 x i8> %x, i32 0 @@ -1922,36 +1933,38 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b3, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov b4, v0.b[3] -; CHECK-GI-NEXT: mov b5, v0.b[4] -; CHECK-GI-NEXT: mov b6, v0.b[5] -; CHECK-GI-NEXT: mov b7, v0.b[6] -; CHECK-GI-NEXT: mov b16, v0.b[7] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[1] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[2], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[4] +; CHECK-GI-NEXT: mov v2.b[3], v3.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[5] +; CHECK-GI-NEXT: mov v2.b[4], v4.b[0] +; CHECK-GI-NEXT: mov b4, v0.b[6] +; CHECK-GI-NEXT: mov b0, v0.b[7] +; CHECK-GI-NEXT: mov v2.b[5], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[2] -; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] -; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v4.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[1] +; CHECK-GI-NEXT: mov v2.b[8], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[9], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[3] +; CHECK-GI-NEXT: mov v2.b[10], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[4] -; CHECK-GI-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[5] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[11], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[5] +; CHECK-GI-NEXT: mov v2.b[12], v3.b[0] ; CHECK-GI-NEXT: mov b3, v1.b[6] -; CHECK-GI-NEXT: mov b1, v1.b[7] -; CHECK-GI-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[13], v0.b[0] +; CHECK-GI-NEXT: mov b0, v1.b[7] +; CHECK-GI-NEXT: mov v2.b[14], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[15], v0.b[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -2017,17 +2030,15 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 
def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI131_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2051,20 +2062,16 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov h3, v1.h[2] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: mov h1, v1.h[3] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[6], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v2.h[4], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[5], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[6], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[7], v1.h[3] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -2097,20 +2104,16 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: mov h3, v1.h[2] -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: mov h1, v1.h[3] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[6], v3.h[0] -; CHECK-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v2.h[4], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[5], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[6], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[7], v1.h[3] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2160,13 +2163,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov v2.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; 
CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0] +; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 @@ -2186,12 +2189,12 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[3], v2.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[3], v1.s[1] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i32> %x, i32 0 @@ -2241,11 +2244,18 @@ entry: } define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_concat_v2i64_v2i64_v1i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], v0.d[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll index 1f5654d59926d..a6a825b26b3b5 100644 --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -466,92 +466,62 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) { ; ; CHECK-GI-LABEL: sext_v32i1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr w9, [sp, #64] -; CHECK-GI-NEXT: ldr w8, [sp, #72] +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w9, [sp, #72] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #80] +; CHECK-GI-NEXT: mov.b v0[1], w1 +; CHECK-GI-NEXT: mov.b v1[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #128] -; CHECK-GI-NEXT: mov.b v0[1], v2[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov.b v1[1], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[2], w2 +; CHECK-GI-NEXT: mov.b v1[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #88] -; CHECK-GI-NEXT: mov.b v0[2], v2[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov.b v1[2], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[3], w3 +; CHECK-GI-NEXT: mov.b v1[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #96] -; CHECK-GI-NEXT: mov.b v0[3], v2[0] -; CHECK-GI-NEXT: fmov s2, w4 -; CHECK-GI-NEXT: mov.b v1[3], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[4], 
w4 +; CHECK-GI-NEXT: mov.b v1[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #104] -; CHECK-GI-NEXT: mov.b v0[4], v2[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov.b v1[4], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[5], w5 +; CHECK-GI-NEXT: mov.b v1[5], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: mov.b v0[5], v2[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov.b v1[5], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[6], w6 +; CHECK-GI-NEXT: mov.b v1[6], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov.b v0[6], v2[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov.b v1[6], v3[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov.b v0[7], w7 +; CHECK-GI-NEXT: mov.b v1[7], w8 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: mov.b v0[7], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[8], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: mov.b v1[7], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[8], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #136] -; CHECK-GI-NEXT: mov.b v0[8], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[9], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov.b v1[8], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #144] -; CHECK-GI-NEXT: mov.b v0[9], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov.b v1[9], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[10], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #152] -; CHECK-GI-NEXT: mov.b v0[10], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov.b v1[10], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[11], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #160] -; CHECK-GI-NEXT: mov.b v0[11], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov.b v1[11], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[12], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #168] -; CHECK-GI-NEXT: mov.b v0[12], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov.b v1[12], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[13], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #176] -; CHECK-GI-NEXT: mov.b v0[13], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v0[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov.b v1[13], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v1[14], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #184] -; CHECK-GI-NEXT: mov.b v0[14], v2[0] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov.b v1[14], v3[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov.b v0[15], v2[0] -; CHECK-GI-NEXT: mov.b v1[15], v3[0] +; CHECK-GI-NEXT: mov.b v0[15], w8 +; CHECK-GI-NEXT: mov.b v1[15], w9 ; CHECK-GI-NEXT: shl.16b v0, v0, #7 ; CHECK-GI-NEXT: shl.16b v1, v1, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 @@ -840,194 +810,134 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) { ; CHECK-GI-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w29, -16 -; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: ldr w11, [sp, #88] +; CHECK-GI-NEXT: ldr w13, [sp, #80] +; CHECK-GI-NEXT: ldr w11, [sp, #208] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s3, w1 -; CHECK-GI-NEXT: ldr w8, [sp, #208] -; CHECK-GI-NEXT: ldr w10, [sp, #216] -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s4, w11 ; CHECK-GI-NEXT: ldr w9, [sp, #336] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w11, [sp, #344] -; CHECK-GI-NEXT: mov.b v0[1], v3[0] +; CHECK-GI-NEXT: ldr w8, [sp, #88] +; CHECK-GI-NEXT: ldr w10, [sp, #216] +; CHECK-GI-NEXT: fmov s1, w13 +; CHECK-GI-NEXT: fmov s2, w11 +; CHECK-GI-NEXT: ldr w12, [sp, #344] ; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v0[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #224] -; CHECK-GI-NEXT: mov.b v1[1], v4[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: fmov s6, w11 -; CHECK-GI-NEXT: mov.b v2[1], v5[0] +; CHECK-GI-NEXT: ldr w11, [sp, #400] +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v2[1], w10 ; CHECK-GI-NEXT: ldr w8, [sp, #96] +; CHECK-GI-NEXT: mov.b v3[1], w12 ; CHECK-GI-NEXT: ldr w10, [sp, #352] -; CHECK-GI-NEXT: ldr w11, [sp, #16] -; CHECK-GI-NEXT: mov.b v0[2], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #232] -; CHECK-GI-NEXT: mov.b v3[1], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: mov.b v0[2], w2 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v2[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #104] +; CHECK-GI-NEXT: mov.b v3[2], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #232] ; CHECK-GI-NEXT: ldr w10, [sp, #360] -; CHECK-GI-NEXT: mov.b v2[2], v4[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: mov.b v1[2], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov.b v0[3], w3 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: mov.b v2[3], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: mov.b v3[2], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #368] -; CHECK-GI-NEXT: mov.b v0[3], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[3], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #240] -; CHECK-GI-NEXT: mov.b v1[3], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w10, [sp, #368] +; CHECK-GI-NEXT: mov.b v0[4], w4 +; CHECK-GI-NEXT: mov.b v1[4], w8 +; CHECK-GI-NEXT: mov.b v2[4], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov.b v2[3], v4[0] -; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: mov.b v3[3], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #376] -; CHECK-GI-NEXT: mov.b v0[4], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[4], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #248] -; CHECK-GI-NEXT: mov.b v1[4], v5[0] -; CHECK-GI-NEXT: mov.b v3[4], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #376] +; CHECK-GI-NEXT: mov.b v0[5], w5 +; CHECK-GI-NEXT: mov.b v1[5], w8 +; CHECK-GI-NEXT: mov.b v2[5], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #128] -; CHECK-GI-NEXT: ldr w10, [sp, #384] -; CHECK-GI-NEXT: mov.b v2[4], v4[0] -; CHECK-GI-NEXT: fmov s4, w5 -; CHECK-GI-NEXT: mov.b v1[5], v5[0] -; CHECK-GI-NEXT: mov.b v3[5], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov.b v0[5], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[5], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #256] -; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #384] +; CHECK-GI-NEXT: mov.b v0[6], w6 
+; CHECK-GI-NEXT: mov.b v1[6], w8 +; CHECK-GI-NEXT: mov.b v2[6], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #136] -; CHECK-GI-NEXT: ldr w10, [sp, #392] -; CHECK-GI-NEXT: mov.b v2[5], v4[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov.b v1[6], v5[0] -; CHECK-GI-NEXT: mov.b v3[6], v6[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w8, [sp, #144] -; CHECK-GI-NEXT: ldr w10, [sp, #400] -; CHECK-GI-NEXT: mov.b v0[6], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov.b v3[6], w10 ; CHECK-GI-NEXT: ldr w9, [sp, #264] -; CHECK-GI-NEXT: mov.b v1[7], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #152] -; CHECK-GI-NEXT: mov.b v3[7], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #408] -; CHECK-GI-NEXT: mov.b v2[6], v4[0] -; CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: mov.b v1[8], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #160] -; CHECK-GI-NEXT: mov.b v0[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #272] -; CHECK-GI-NEXT: mov.b v3[8], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #416] -; CHECK-GI-NEXT: mov.b v2[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #24] -; CHECK-GI-NEXT: mov.b v1[9], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #168] -; CHECK-GI-NEXT: mov.b v3[9], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #424] -; CHECK-GI-NEXT: mov.b v0[8], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #280] -; CHECK-GI-NEXT: mov.b v1[10], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #176] -; CHECK-GI-NEXT: mov.b v2[8], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #32] -; CHECK-GI-NEXT: mov.b v3[10], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #432] -; CHECK-GI-NEXT: mov.b v0[9], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #288] -; CHECK-GI-NEXT: mov.b v1[11], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #184] -; CHECK-GI-NEXT: mov.b v3[11], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #440] -; CHECK-GI-NEXT: mov.b v2[9], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov.b v1[12], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #192] -; CHECK-GI-NEXT: mov.b v0[10], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #296] -; CHECK-GI-NEXT: mov.b v3[12], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #448] -; CHECK-GI-NEXT: mov.b v2[10], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #48] -; CHECK-GI-NEXT: mov.b v1[13], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #200] -; CHECK-GI-NEXT: mov.b v3[13], v6[0] -; CHECK-GI-NEXT: fmov s6, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #456] -; CHECK-GI-NEXT: mov.b v0[11], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #304] -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: mov.b v1[14], v5[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov.b v2[11], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #56] -; CHECK-GI-NEXT: mov.b v3[14], v6[0] -; CHECK-GI-NEXT: mov.b v0[12], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #312] -; CHECK-GI-NEXT: mov.b v1[15], v5[0] -; CHECK-GI-NEXT: mov.b v3[15], v7[0] -; CHECK-GI-NEXT: mov.b v2[12], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; 
CHECK-GI-NEXT: ldr w11, [sp, #64] -; CHECK-GI-NEXT: shl.16b v1, v1, #7 -; CHECK-GI-NEXT: mov.b v0[13], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #320] -; CHECK-GI-NEXT: shl.16b v3, v3, #7 -; CHECK-GI-NEXT: sshr.16b v1, v1, #7 -; CHECK-GI-NEXT: mov.b v2[13], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #72] -; CHECK-GI-NEXT: sshr.16b v3, v3, #7 -; CHECK-GI-NEXT: mov.b v0[14], v4[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #328] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov.b v2[14], v4[0] -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: mov.b v0[15], v4[0] -; CHECK-GI-NEXT: mov.b v2[15], v6[0] +; CHECK-GI-NEXT: ldr w10, [sp, #392] +; CHECK-GI-NEXT: mov.b v0[7], w7 +; CHECK-GI-NEXT: mov.b v1[7], w8 +; CHECK-GI-NEXT: mov.b v2[7], w9 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov.b v3[7], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #144] +; CHECK-GI-NEXT: ldr w10, [sp, #272] +; CHECK-GI-NEXT: mov.b v0[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov.b v1[8], w9 +; CHECK-GI-NEXT: mov.b v2[8], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #152] +; CHECK-GI-NEXT: mov.b v3[8], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #280] +; CHECK-GI-NEXT: ldr w11, [sp, #408] +; CHECK-GI-NEXT: mov.b v0[9], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov.b v1[9], w9 +; CHECK-GI-NEXT: mov.b v2[9], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #160] +; CHECK-GI-NEXT: mov.b v3[9], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #288] +; CHECK-GI-NEXT: ldr w11, [sp, #416] +; CHECK-GI-NEXT: mov.b v0[10], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov.b v1[10], w9 +; CHECK-GI-NEXT: mov.b v2[10], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #168] +; CHECK-GI-NEXT: mov.b v3[10], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #296] +; CHECK-GI-NEXT: ldr w11, [sp, #424] +; CHECK-GI-NEXT: mov.b v0[11], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; CHECK-GI-NEXT: mov.b v1[11], w9 +; CHECK-GI-NEXT: mov.b v2[11], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #176] +; CHECK-GI-NEXT: mov.b v3[11], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #304] +; CHECK-GI-NEXT: ldr w11, [sp, #432] +; CHECK-GI-NEXT: mov.b v0[12], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov.b v1[12], w9 +; CHECK-GI-NEXT: mov.b v2[12], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #184] +; CHECK-GI-NEXT: mov.b v3[12], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #312] +; CHECK-GI-NEXT: ldr w11, [sp, #440] +; CHECK-GI-NEXT: mov.b v0[13], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] +; CHECK-GI-NEXT: mov.b v1[13], w9 +; CHECK-GI-NEXT: mov.b v2[13], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #192] +; CHECK-GI-NEXT: mov.b v3[13], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #320] +; CHECK-GI-NEXT: ldr w11, [sp, #448] +; CHECK-GI-NEXT: mov.b v0[14], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #72] +; CHECK-GI-NEXT: mov.b v1[14], w9 +; CHECK-GI-NEXT: mov.b v2[14], w10 +; CHECK-GI-NEXT: ldr w9, [sp, #200] +; CHECK-GI-NEXT: mov.b v3[14], w11 +; CHECK-GI-NEXT: ldr w10, [sp, #328] +; CHECK-GI-NEXT: ldr w11, [sp, #456] +; CHECK-GI-NEXT: mov.b v0[15], w8 +; CHECK-GI-NEXT: mov.b v1[15], w9 +; CHECK-GI-NEXT: mov.b v2[15], w10 +; CHECK-GI-NEXT: mov.b v3[15], w11 ; CHECK-GI-NEXT: shl.16b v0, v0, #7 +; CHECK-GI-NEXT: shl.16b v1, v1, #7 ; CHECK-GI-NEXT: shl.16b v2, v2, #7 +; CHECK-GI-NEXT: shl.16b v3, v3, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 +; CHECK-GI-NEXT: sshr.16b v1, v1, #7 ; CHECK-GI-NEXT: sshr.16b v2, v2, #7 +; CHECK-GI-NEXT: sshr.16b v3, v3, #7 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret %res = sext <64 x i1> %arg to <64 x i8> diff 
--git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index 44b92e6ccd088..a854cb7fec991 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -368,28 +368,26 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: mov.b v4[14], w8 +; CHECK-GI-NEXT: mov.b v4[15], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI10_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v4[0] -; CHECK-GI-NEXT: mov.b v5[13], v4[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI10_1] +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI10_1] ; CHECK-GI-NEXT: adrp x8, .LCPI10_0 -; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 -; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 ; CHECK-GI-NEXT: ret @@ -488,35 +486,32 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x ; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #1 // =0x1 -; CHECK-GI-NEXT: fmov s6, w0 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov w8, #255 // =0xff ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov.b v4[1], w8 +; CHECK-GI-NEXT: mov.b v4[2], w8 +; CHECK-GI-NEXT: mov.b v4[3], w8 +; CHECK-GI-NEXT: mov.b v4[4], w8 +; CHECK-GI-NEXT: mov.b v4[5], w8 +; CHECK-GI-NEXT: mov.b v4[6], w8 +; CHECK-GI-NEXT: mov.b v4[7], w8 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w0 +; CHECK-GI-NEXT: mov.b v4[13], w0 +; 
CHECK-GI-NEXT: mov.b v4[14], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI11_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v6[0] -; CHECK-GI-NEXT: mov.b v5[13], v6[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI11_1] ; CHECK-GI-NEXT: adrp x8, .LCPI11_0 -; CHECK-GI-NEXT: mov.b v5[15], v6[0] -; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 -; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: mov.b v4[15], w0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 ; CHECK-GI-NEXT: ret @@ -623,32 +618,30 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s4, w0 ; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: adrp x9, .LCPI12_1 ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x9, :lo12:.LCPI12_1] ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: adrp x8, .LCPI12_1 -; CHECK-GI-NEXT: mov.b v5[8], v4[0] -; CHECK-GI-NEXT: mov.b v5[9], v4[0] -; CHECK-GI-NEXT: mov.b v5[10], v4[0] -; CHECK-GI-NEXT: mov.b v5[11], v4[0] -; CHECK-GI-NEXT: mov.b v5[12], v4[0] -; CHECK-GI-NEXT: mov.b v5[13], v4[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI12_1] +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: mov.b v4[14], w8 +; CHECK-GI-NEXT: mov.b v4[15], w8 ; CHECK-GI-NEXT: adrp x8, .LCPI12_0 -; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 -; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4 ; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI12_0] ; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 ; CHECK-GI-NEXT: ret @@ -774,30 +767,28 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 ; CHECK-GI-NEXT: mov w8, #255 // =0xff ; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: fmov s6, w8 -; CHECK-GI-NEXT: adrp x8, .LCPI13_1 ; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: mov.16b v5, v4 -; CHECK-GI-NEXT: mov.b v5[1], v4[0] -; CHECK-GI-NEXT: mov.b v5[2], v4[0] -; CHECK-GI-NEXT: 
mov.b v5[3], v4[0] -; CHECK-GI-NEXT: mov.b v5[4], v4[0] -; CHECK-GI-NEXT: mov.b v5[5], v4[0] -; CHECK-GI-NEXT: mov.b v5[6], v4[0] -; CHECK-GI-NEXT: mov.b v5[7], v4[0] -; CHECK-GI-NEXT: mov.b v5[8], v6[0] -; CHECK-GI-NEXT: mov.b v5[9], v6[0] -; CHECK-GI-NEXT: mov.b v5[10], v6[0] -; CHECK-GI-NEXT: mov.b v5[11], v6[0] -; CHECK-GI-NEXT: mov.b v5[12], v6[0] -; CHECK-GI-NEXT: mov.b v5[13], v6[0] -; CHECK-GI-NEXT: mov.b v5[14], v4[0] -; CHECK-GI-NEXT: mov.b v5[15], v4[0] -; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI13_1] +; CHECK-GI-NEXT: mov.b v4[1], w0 +; CHECK-GI-NEXT: mov.b v4[2], w0 +; CHECK-GI-NEXT: mov.b v4[3], w0 +; CHECK-GI-NEXT: mov.b v4[4], w0 +; CHECK-GI-NEXT: mov.b v4[5], w0 +; CHECK-GI-NEXT: mov.b v4[6], w0 +; CHECK-GI-NEXT: mov.b v4[7], w0 +; CHECK-GI-NEXT: mov.b v4[8], w8 +; CHECK-GI-NEXT: mov.b v4[9], w8 +; CHECK-GI-NEXT: mov.b v4[10], w8 +; CHECK-GI-NEXT: mov.b v4[11], w8 +; CHECK-GI-NEXT: mov.b v4[12], w8 +; CHECK-GI-NEXT: mov.b v4[13], w8 +; CHECK-GI-NEXT: adrp x8, .LCPI13_1 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI13_1] ; CHECK-GI-NEXT: adrp x8, .LCPI13_0 -; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 -; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-GI-NEXT: mov.b v4[14], w0 +; CHECK-GI-NEXT: mov.b v4[15], w0 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v4 ; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI13_0] ; CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 5de99586f7fc7..79cfeedb74bce 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -13,7 +13,7 @@ define <4 x i16> @foo1(<2 x i32> %a) { ; CHECK-GI-LABEL: foo1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #58712 // =0xe558 -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-GI-NEXT: rev32 v0.4h, v0.4h ; CHECK-GI-NEXT: ret @@ -33,7 +33,7 @@ define <4 x i16> @foo2(<2 x i32> %a) { ; CHECK-GI-LABEL: foo2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #712 // =0x2c8 -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-GI-NEXT: rev32 v0.4h, v0.4h ; CHECK-GI-NEXT: ret @@ -60,13 +60,11 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <4 x i8> %a, %b @@ -87,12 +85,13 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ ; CHECK-GI-NEXT: add w8, w0, w1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; 
CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add i32 %a, %b @@ -117,9 +116,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -419,16 +418,17 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_v4i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -455,13 +455,11 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 @@ -515,10 +513,12 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){ ; ; CHECK-GI-LABEL: bitcast_v8i32_v4i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mov x9, v3.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v2.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v3.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -574,10 +574,12 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){ ; ; CHECK-GI-LABEL: bitcast_v16i16_v4i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: add v2.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: add v3.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mov x9, v3.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v2.d[0] +; 
CHECK-GI-NEXT: mov v1.d[0], v3.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: ret @@ -614,14 +616,18 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ ; ; CHECK-GI-LABEL: bitcast_v16i32_v8i64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mov x10, v2.d[1] -; CHECK-GI-NEXT: mov x11, v3.d[1] +; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v5.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: add v6.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: add v7.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: mov x8, v4.d[1] +; CHECK-GI-NEXT: mov x9, v5.d[1] +; CHECK-GI-NEXT: mov x10, v6.d[1] +; CHECK-GI-NEXT: mov x11, v7.d[1] +; CHECK-GI-NEXT: mov v0.d[0], v4.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v5.d[0] +; CHECK-GI-NEXT: mov v2.d[0], v6.d[0] +; CHECK-GI-NEXT: mov v3.d[0], v7.d[0] ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: mov v2.d[1], x10 diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index 071613b9cc011..9ee924dd2548a 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -110,8 +110,8 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){ ; CHECK-GI-LABEL: bswap_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 ; CHECK-GI-NEXT: rev16 v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -146,7 +146,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){ ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: rev w8, w8 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index f6eeeef4faf7e..18570b2d793ff 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -11,12 +11,13 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) { ; CHECK-GI-LABEL: concat1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov s2, v1.s[1] -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v2.b[0] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v0.b[3], w9 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -32,22 +33,20 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) { ; ; CHECK-GI-LABEL: concat2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov 
v0.h[1], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v6.h[0] -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v3.h[0], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v3.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v3.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v3.h[3], v1.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v2.8h +; CHECK-GI-NEXT: xtn v1.8b, v3.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -75,14 +74,16 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) { ; ; CHECK-GI-LABEL: concat4: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[1], v3.s[0] -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: xtn v2.4h, v2.4s +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: xtn v1.4h, v0.4s +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -145,8 +146,9 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) { ; ; CHECK-GI-LABEL: concat9: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -181,12 +183,14 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v8s16_v2s16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: ldr h1, [x0] ; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: xtn v2.4h, v0.4s -; CHECK-GI-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 @@ -208,9 +212,10 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { ; CHECK-GI-NEXT: dup v0.8h, w8 ; CHECK-GI-NEXT: xtn v1.8b, v0.8h ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 ; CHECK-GI-NEXT: ret %a = load <4 x i8>, ptr %ptr %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> @@ -218,24 +223,13 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { } define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) { -; CHECK-SD-LABEL: concat_v16s8_v4s8_load: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr s0, [x0] -; CHECK-SD-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-SD-NEXT: 
ld1 { v0.s }[2], [x2] -; CHECK-SD-NEXT: ld1 { v0.s }[3], [x3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_v16s8_v4s8_load: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: ldr s1, [x1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x2] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x3] -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_v16s8_v4s8_load: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-NEXT: ld1 { v0.s }[2], [x2] +; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB %C = load <4 x i8>, ptr %ptrC @@ -261,41 +255,35 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, < ; ; CHECK-GI-LABEL: concat_v16s8_v4s8_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h4, v1.h[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h5, v0.h[1] +; CHECK-GI-NEXT: mov v4.h[0], v0.h[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v5.h[0], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov h6, v1.h[2] -; CHECK-GI-NEXT: mov h7, v1.h[3] -; CHECK-GI-NEXT: mov h16, v2.h[1] -; CHECK-GI-NEXT: mov h17, v0.h[3] -; CHECK-GI-NEXT: mov h18, v2.h[3] -; CHECK-GI-NEXT: mov v1.h[1], v4.h[0] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov v0.h[1], v5.h[0] -; CHECK-GI-NEXT: mov h5, v2.h[2] -; CHECK-GI-NEXT: mov v2.h[1], v16.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v6.h[0] -; CHECK-GI-NEXT: mov h6, v3.h[1] -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: mov h4, v3.h[2] -; CHECK-GI-NEXT: mov h5, v3.h[3] -; CHECK-GI-NEXT: mov v1.h[3], v7.h[0] -; CHECK-GI-NEXT: mov v3.h[1], v6.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v17.h[0] -; CHECK-GI-NEXT: mov v2.h[3], v18.h[0] -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h -; CHECK-GI-NEXT: xtn v2.8b, v2.8h -; CHECK-GI-NEXT: mov v3.h[3], v5.h[0] +; CHECK-GI-NEXT: mov v6.h[0], v2.h[0] +; CHECK-GI-NEXT: mov v7.h[0], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v5.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v6.h[1], v2.h[1] +; CHECK-GI-NEXT: mov v7.h[1], v3.h[1] +; CHECK-GI-NEXT: mov v4.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v5.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v6.h[2], v2.h[2] +; CHECK-GI-NEXT: mov v7.h[2], v3.h[2] +; CHECK-GI-NEXT: mov v4.h[3], v0.h[3] +; CHECK-GI-NEXT: mov v5.h[3], v1.h[3] +; CHECK-GI-NEXT: mov v6.h[3], v2.h[3] +; CHECK-GI-NEXT: mov v7.h[3], v3.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v4.8h +; CHECK-GI-NEXT: xtn v1.8b, v5.8h +; CHECK-GI-NEXT: xtn v2.8b, v6.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: xtn v1.8b, v7.8h ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: xtn v1.8b, v3.8h ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[3], w8 @@ -320,27 +308,29 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> % ; ; CHECK-GI-LABEL: concat_v8s16_v2s16_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s4, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s5, v0.s[1] +; 
CHECK-GI-NEXT: mov v4.s[0], v0.s[0] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov v1.s[1], v4.s[0] -; CHECK-GI-NEXT: mov s4, v2.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] +; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v1.s[0], v2.s[0] +; CHECK-GI-NEXT: xtn v0.4h, v4.4s +; CHECK-GI-NEXT: xtn v4.4h, v5.4s +; CHECK-GI-NEXT: mov v1.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v3.s[0] +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: mov v2.s[1], v4.s[0] -; CHECK-GI-NEXT: mov s4, v3.s[1] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v2.s[1], v3.s[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s4 ; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: mov v3.s[1], v4.s[0] -; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: xtn v1.4h, v3.4s -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[3], w8 ; CHECK-GI-NEXT: ret %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll index de108b0bc2b7a..e19e2ead11f4d 100644 --- a/llvm/test/CodeGen/AArch64/fabs.ll +++ b/llvm/test/CodeGen/AArch64/fabs.ll @@ -161,27 +161,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: fabs_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: fabs v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fabs_v7f16: diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index 6913a62fb266c..b15579199a059 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -188,33 +188,25 @@ define <7 x half> 
@fadd_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fadd_v7f16: @@ -537,33 +529,25 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], 
v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fsub_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index a5d7ae147ffda..8ca1e9ee5b617 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -556,7 +556,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w19, lt ; CHECK-GI-NEXT: bl __lttf2 -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w8, lt ; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload @@ -663,29 +663,29 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: cset w22, lt ; CHECK-GI-NEXT: bl __lttf2 -; CHECK-GI-NEXT: ldp q0, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: sbfx x8, x21, #0, #1 -; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload -; CHECK-GI-NEXT: sbfx x9, x22, #0, #1 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v2.d[1], v0.d[0] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: sbfx x8, x22, #0, #1 +; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] +; CHECK-GI-NEXT: ldp q4, q3, [sp, #96] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x8 ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: cset w8, lt ; CHECK-GI-NEXT: sbfx x8, x8, #0, #1 -; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: bic v0.16b, v3.16b, v0.16b ; CHECK-GI-NEXT: and x9, x19, x8 ; CHECK-GI-NEXT: bic x8, x20, x8 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: orr x8, x9, x8 -; CHECK-GI-NEXT: bic v1.16b, v3.16b, v1.16b -; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #176 @@ -831,21 +831,21 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: fcmp d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: cset w9, mi -; CHECK-GI-NEXT: mov 
v1.s[1], w8 -; CHECK-GI-NEXT: fmov d2, x9 +; CHECK-GI-NEXT: mov v2.d[0], x9 +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v3.s[0], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mov v3.s[1], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: neg v1.4s, v1.4s -; CHECK-GI-NEXT: mov v2.s[2], w8 +; CHECK-GI-NEXT: mov v3.s[2], w9 ; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -902,18 +902,18 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; CHECK-GI-LABEL: v3f32_float: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -980,18 +980,18 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; CHECK-GI-LABEL: v3f32_i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1106,44 +1106,38 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: fmov s4, w8 -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[6] -; 
CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: fmov s5, w8 +; CHECK-GI-NOFP16-NEXT: mov w9, #65535 // =0xffff +; CHECK-GI-NOFP16-NEXT: fmov s7, w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v7.h[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v7.h[2], w9 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s6, w8 -; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b +; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8 +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h +; CHECK-GI-NOFP16-NEXT: mov v7.h[3], w9 ; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s -; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[4], w8 +; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v6.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v7.h[4], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[5], w8 ; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0] -; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h -; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h +; CHECK-GI-NOFP16-NEXT: mov v7.h[5], w9 +; CHECK-GI-NOFP16-NEXT: mov v5.h[6], w8 +; CHECK-GI-NOFP16-NEXT: mov v7.h[6], w9 +; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v7.16b ; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1152,28 +1146,26 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-FP16-LABEL: v7f16_half: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: mov w8, #15 // =0xf +; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-GI-FP16-NEXT: fmov s4, w8 -; CHECK-GI-FP16-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-FP16-NEXT: fmov s6, w8 -; CHECK-GI-FP16-NEXT: mov v5.16b, v4.16b -; CHECK-GI-FP16-NEXT: mov v7.16b, v6.16b -; CHECK-GI-FP16-NEXT: mov v5.h[1], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[1], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[2], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[2], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[3], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[3], v6.h[0] -; 
CHECK-GI-FP16-NEXT: mov v5.h[4], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[4], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[5], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0] -; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0] -; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0] -; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h -; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h +; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: mov v4.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[1], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[2], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[2], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[3], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[3], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[4], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[4], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[5], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[5], w9 +; CHECK-GI-FP16-NEXT: mov v4.h[6], w8 +; CHECK-GI-FP16-NEXT: mov v5.h[6], w9 +; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v4.8h +; CHECK-GI-FP16-NEXT: neg v1.8h, v4.8h ; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h -; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b +; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-FP16-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-FP16-NEXT: orr v0.16b, v0.16b, v1.16b @@ -1599,59 +1591,52 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; ; CHECK-GI-NOFP16-LABEL: v7f16_i32: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] ; CHECK-GI-NOFP16-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5] -; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #32] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] -; CHECK-GI-NOFP16-NEXT: fmov s16, w0 -; CHECK-GI-NOFP16-NEXT: fmov s18, w4 +; CHECK-GI-NOFP16-NEXT: mov v4.s[0], w8 +; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-GI-NOFP16-NEXT: mov v5.s[0], w0 +; CHECK-GI-NOFP16-NEXT: mov v6.s[0], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[0], w7 +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp] +; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #24] +; CHECK-GI-NOFP16-NEXT: ldr s18, [sp, #32] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v17.s[1], v18.s[0] +; CHECK-GI-NOFP16-NEXT: mov v6.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v16.s[0] +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #8] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s3, w8 -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s5, [sp] -; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 -; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 -; CHECK-GI-NOFP16-NEXT: fmov w9, s5 -; CHECK-GI-NOFP16-NEXT: fmov s5, w7 -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v6.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v7.h[0] -; CHECK-GI-NOFP16-NEXT: ldr s7, [sp, #24] -; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2 -; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w9 -; CHECK-GI-NOFP16-NEXT: fmov w9, s6 -; 
CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] -; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w2 +; CHECK-GI-NOFP16-NEXT: mov v6.s[2], w9 +; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v16.s[0] +; CHECK-GI-NOFP16-NEXT: ldr s16, [sp, #40] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 -; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 -; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: fmov s4, w8 -; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 -; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: fmov w8, s6 -; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b -; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v17.s[2], v16.s[0] +; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w3 +; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v3.s[0], w4 +; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5 +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: ldr s4, [sp, #16] +; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6 +; CHECK-GI-NOFP16-NEXT: mov v7.s[3], v4.s[0] +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v6.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b +; CHECK-GI-NOFP16-NEXT: and v1.16b, v17.16b, v1.16b +; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v5.16b, v7.16b ; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2] @@ -1670,59 +1655,56 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-LABEL: v7f16_i32: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h -; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f -; CHECK-GI-FP16-NEXT: ldr s3, [sp] -; CHECK-GI-FP16-NEXT: fmov s2, w10 -; CHECK-GI-FP16-NEXT: fmov s6, w0 -; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] -; CHECK-GI-FP16-NEXT: fmov s17, w4 -; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] +; CHECK-GI-FP16-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-FP16-NEXT: mov v4.s[0], w0 +; CHECK-GI-FP16-NEXT: mov v2.s[0], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[0], w7 +; CHECK-GI-FP16-NEXT: ldr s6, [sp] +; CHECK-GI-FP16-NEXT: mov v7.s[0], w4 ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] +; CHECK-GI-FP16-NEXT: ldr s17, [sp, #8] ; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] -; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w10 -; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 -; CHECK-GI-FP16-NEXT: mov v17.s[1], w5 -; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] +; CHECK-GI-FP16-NEXT: umov w10, v0.h[5] +; CHECK-GI-FP16-NEXT: mov v4.s[1], w1 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[1], v6.s[0] +; CHECK-GI-FP16-NEXT: ldr s6, [sp, #24] +; CHECK-GI-FP16-NEXT: mov v7.s[1], w5 +; CHECK-GI-FP16-NEXT: mov v6.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr 
s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s1, w8 +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 -; CHECK-GI-FP16-NEXT: mov v17.s[2], w6 -; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 -; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w9 +; CHECK-GI-FP16-NEXT: mov v4.s[2], w2 +; CHECK-GI-FP16-NEXT: mov v5.s[2], v17.s[0] +; CHECK-GI-FP16-NEXT: mov v7.s[2], w6 ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 -; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 -; CHECK-GI-FP16-NEXT: fmov w8, s3 -; CHECK-GI-FP16-NEXT: fmov s3, w7 -; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v6.s[2], v16.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 +; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-FP16-NEXT: mov v3.s[0], w10 +; CHECK-GI-FP16-NEXT: mov v4.s[3], w3 ; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 -; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v3.s[1], w10 ; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s -; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v3.s[2], w10 ; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s -; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b -; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b -; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 -; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b -; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b +; CHECK-GI-FP16-NEXT: ldr s2, [sp, #16] +; CHECK-GI-FP16-NEXT: mov v5.s[3], v2.s[0] +; CHECK-GI-FP16-NEXT: eor v3.16b, v1.16b, v3.16b +; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v6.16b, v3.16b +; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v5.16b ; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: mov s5, v1.s[1] ; CHECK-GI-FP16-NEXT: mov s6, v1.s[2] -; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: fmov w4, s1 ; CHECK-GI-FP16-NEXT: fmov w1, s2 ; CHECK-GI-FP16-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index 84376107679d8..a42ec8e253be2 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -156,8 +156,8 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 ; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v2.s[0], w9 +; CHECK-GI-NEXT: mov v3.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mov v3.s[1], w8 ; CHECK-GI-NEXT: mov v2.s[2], w9 @@ -207,22 +207,20 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff ; CHECK-GI-NEXT: fmov s2, w9 ; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v5.16b, v3.16b -; CHECK-GI-NEXT: mov v4.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v3.h[0] -; 
CHECK-GI-NEXT: mov v4.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[3], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[3], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[4], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[5], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[5], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[6], v2.h[0] -; CHECK-GI-NEXT: mov v5.h[6], v3.h[0] -; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v5.16b +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: mov v3.h[2], w8 +; CHECK-GI-NEXT: mov v2.h[3], w9 +; CHECK-GI-NEXT: mov v3.h[3], w8 +; CHECK-GI-NEXT: mov v2.h[4], w9 +; CHECK-GI-NEXT: mov v3.h[4], w8 +; CHECK-GI-NEXT: mov v2.h[5], w9 +; CHECK-GI-NEXT: mov v3.h[5], w8 +; CHECK-GI-NEXT: mov v2.h[6], w9 +; CHECK-GI-NEXT: mov v3.h[6], w8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll index 1c761ea083028..b408e9c1bd4e6 100644 --- a/llvm/test/CodeGen/AArch64/fcvt.ll +++ b/llvm/test/CodeGen/AArch64/fcvt.ll @@ -164,27 +164,21 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: ceil_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintp v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: ceil_v7f16: @@ -469,27 +463,21 @@ define <7 x half> @floor_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: floor_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] 
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintm v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: floor_v7f16: @@ -774,27 +762,21 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frinti v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: nearbyint_v7f16: @@ -1079,27 +1061,21 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; 
CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintn v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: roundeven_v7f16: @@ -1384,27 +1360,21 @@ define <7 x half> @rint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: rint_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintx v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: rint_v7f16: @@ -1689,27 +1659,21 @@ define <7 x half> @round_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: round_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; 
CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frinta v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: round_v7f16: @@ -1994,27 +1958,21 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: trunc_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: frintz v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: trunc_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index d73a5dc73eefc..5bdccccc62b99 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -188,33 +188,25 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: 
mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fdiv v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v1.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fdiv_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index 93d3d96d67b65..30ce389f23128 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -678,12 +678,12 @@ define <7 x half> @exp_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -701,18 +701,19 @@ define <7 x half> @exp_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; 
CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -789,21 +790,21 @@ define <4 x half> @exp_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -919,12 +920,12 @@ define <8 x half> @exp_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -947,21 +948,21 @@ define <8 x half> @exp_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 
32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -1155,7 +1156,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -1180,7 +1181,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -1231,7 +1232,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -1257,7 +1258,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -1948,12 +1949,12 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -1971,18 +1972,19 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp 
q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2059,21 +2061,21 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2189,12 +2191,12 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -2217,21 +2219,21 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov 
v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2425,7 +2427,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -2450,7 +2452,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -2501,7 +2503,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -2527,7 +2529,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -3218,12 +3220,12 @@ define <7 x half> @log_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -3241,18 +3243,19 @@ define <7 x half> @log_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; 
CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3329,21 +3332,21 @@ define <4 x half> @log_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3459,12 +3462,12 @@ define <8 x half> @log_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -3487,21 +3490,21 @@ define <8 x half> @log_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] 
+; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -3695,7 +3698,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -3720,7 +3723,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -3771,7 +3774,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -3797,7 +3800,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -4488,12 +4491,12 @@ define <7 x half> @log2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -4511,18 +4514,19 @@ define <7 x half> @log2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov 
v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4599,21 +4603,21 @@ define <4 x half> @log2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4729,12 +4733,12 @@ define <8 x half> @log2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -4757,21 +4761,21 @@ define <8 x half> @log2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte 
Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -4965,7 +4969,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -4990,7 +4994,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -5041,7 +5045,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -5067,7 +5071,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -5758,12 +5762,12 @@ define <7 x half> @log10_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -5781,18 +5785,19 @@ define <7 x half> @log10_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; 
CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -5869,21 +5874,21 @@ define <4 x half> @log10_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -5999,12 +6004,12 @@ define <8 x half> @log10_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h11 @@ -6027,21 +6032,21 @@ define <8 x half> @log10_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded 
Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -6235,7 +6240,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h9 @@ -6260,7 +6265,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -6311,7 +6316,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -6337,7 +6342,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index 2ea7e0f3c44a9..aa20304e52a95 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -11,13 +11,15 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { ; CHECK-GI-LABEL: interleave2_v4f16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup v2.4s, w8 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: xtn v0.4h, v2.4s +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: mov v2.s[0], w9 +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ret <4 x half> %retval diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index 357d91960624b..fb12f8acf1745 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] 
-; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmin v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmin v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: min_v7f16: @@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmax v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmax v1.4s, v1.4s, v3.4s +; 
CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: max_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index 61199f82615bb..64f0da8b4cd0f 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fminnm v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: min_v7f16: @@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-NOFP16-GI-NEXT: mov h6, v1.h[4] -; CHECK-NOFP16-GI-NEXT: mov h7, v1.h[5] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v4.h[0], v0.h[4] ; CHECK-NOFP16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov h3, v0.h[6] -; CHECK-NOFP16-GI-NEXT: mov v4.h[1], 
v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v3.h[0] -; CHECK-NOFP16-GI-NEXT: mov v6.h[2], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-NOFP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-NOFP16-GI-NEXT: fcvtl v2.4s, v4.4h -; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v6.4h -; CHECK-NOFP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-NOFP16-GI-NEXT: mov v3.h[0], v1.h[4] +; CHECK-NOFP16-GI-NEXT: mov v4.h[1], v0.h[5] +; CHECK-NOFP16-GI-NEXT: mov v3.h[1], v1.h[5] +; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s +; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] +; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h +; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h +; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] +; CHECK-NOFP16-GI-NEXT: fmaxnm v1.4s, v1.4s, v3.4s +; CHECK-NOFP16-GI-NEXT: mov v0.h[2], v2.h[2] ; CHECK-NOFP16-GI-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v5.h[0] -; CHECK-NOFP16-GI-NEXT: mov h2, v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[3], v2.h[3] ; CHECK-NOFP16-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-NOFP16-GI-NEXT: mov h1, v1.h[2] -; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NOFP16-GI-NEXT: mov v0.h[6], v1.h[2] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: max_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index 4b019b57d968d..7bcaae5a77eac 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -257,39 +257,29 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h18, v2.h[4] -; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h2, v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v6.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmla v5.4s, v4.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v19.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v5.4s -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v18.4h -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmla v4.4s, v3.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v4.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; 
CHECK-GI-NOFP16-NEXT: mov v4.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v5.4h, v5.4s +; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v5.h[1] +; CHECK-GI-NOFP16-NEXT: fmla v3.4s, v2.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v5.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fma_v7f16: @@ -864,46 +854,36 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6] -; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, 
v1.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmuladd_v7f16: @@ -1362,46 +1342,36 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v16.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v1.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v2.h[4] -; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v7.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v2.h[6] -; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v5.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov 
v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmul_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll index 1f41f2385c335..bd3d1353e643e 100644 --- a/llvm/test/CodeGen/AArch64/fmul.ll +++ b/llvm/test/CodeGen/AArch64/fmul.ll @@ -188,33 +188,25 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v4.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fmul v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h -; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v3.h[0], v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v3.h[1], v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v5.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v2.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fmul_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll index cc0f7d2fd6075..a0e9edff733e0 100644 --- a/llvm/test/CodeGen/AArch64/fneg.ll +++ b/llvm/test/CodeGen/AArch64/fneg.ll @@ -162,27 +162,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: fabs_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fneg v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fneg 
v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: fneg v2.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fabs_v7f16: diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 8d40121ad4543..6e8cd0c8c00b4 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -965,22 +965,22 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll index 5dbcaa4a5fda1..62fc1c0854ca8 100644 --- a/llvm/test/CodeGen/AArch64/fpowi.ll +++ b/llvm/test/CodeGen/AArch64/fpowi.ll @@ -869,22 +869,22 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) { ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; 
CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 0c880592d955b..20b5567e973d0 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -2585,7 +2585,7 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptos_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s ; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d @@ -2614,7 +2614,7 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptou_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s ; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d @@ -3181,10 +3181,10 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) { ; CHECK-GI-LABEL: fptos_v3f32_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -3202,10 +3202,10 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) { ; CHECK-GI-LABEL: fptou_v3f32_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -6077,10 +6077,10 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -6110,10 +6110,10 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, 
v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -7297,7 +7297,7 @@ define <2 x i64> @fptos_v2f128_v2i64(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x19, x0 ; CHECK-GI-NEXT: bl __fixtfdi -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.d[1], x0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7340,7 +7340,7 @@ define <2 x i64> @fptou_v2f128_v2i64(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x19, x0 ; CHECK-GI-NEXT: bl __fixunstfdi -; CHECK-GI-NEXT: fmov d0, x19 +; CHECK-GI-NEXT: mov v0.d[0], x19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.d[1], x0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7496,7 +7496,7 @@ define <2 x i32> @fptos_v2f128_v2i32(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7539,7 +7539,7 @@ define <2 x i32> @fptou_v2f128_v2i32(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7591,7 +7591,7 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -7644,7 +7644,7 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -7689,9 +7689,8 @@ define <2 x i16> @fptos_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w0 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w0 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #32 @@ -7734,9 +7733,8 @@ define <2 x i16> @fptou_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w0 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w0 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: 
add sp, sp, #32 @@ -7791,12 +7789,10 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w20 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.h[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7850,12 +7846,10 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: fmov s0, w19 -; CHECK-GI-NEXT: fmov s1, w20 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.h[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7896,7 +7890,7 @@ define <2 x i8> @fptos_v2f128_v2i8(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7939,7 +7933,7 @@ define <2 x i8> @fptou_v2f128_v2i8(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index aec5d7959226c..c0d4ddef23132 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -261,9 +261,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fcvt s2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] ; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -363,9 +363,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 1a10fd2f1cdc3..fe5146d79895c 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -952,22 +952,22 @@ 
define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 0b34f9570fa77..557add3a4eaeb 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -678,12 +678,12 @@ define <7 x half> @sin_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -701,18 +701,19 @@ define <7 x half> @sin_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -789,21 +790,21 @@ define <4 x half> 
@sin_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -919,12 +920,12 @@ define <8 x half> @sin_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -947,21 +948,21 @@ define <8 x half> @sin_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -1155,7 +1156,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; 
CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -1180,7 +1181,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -1231,7 +1232,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -1257,7 +1258,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] @@ -1948,12 +1949,12 @@ define <7 x half> @cos_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -1971,18 +1972,19 @@ define <7 x half> @cos_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2059,21 +2061,21 @@ define <4 x half> @cos_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; 
CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2189,12 +2191,12 @@ define <8 x half> @cos_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h11 @@ -2217,21 +2219,21 @@ define <8 x half> @cos_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q3, q2, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #152] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #136] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #168] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: ldr d14, [sp, #112] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #48] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #120] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldp q2, q3, [sp, #16] // 32-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] +; CHECK-GI-NEXT: ldp q2, q3, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] ; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] ; CHECK-GI-NEXT: mov v0.16b, v1.16b @@ -2425,7 +2427,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded 
Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h9 @@ -2450,7 +2452,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload @@ -2501,7 +2503,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl cosf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #304] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldp q1, q2, [sp] // 32-byte Folded Reload @@ -2527,7 +2529,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 -; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll index 4b48bcc5508db..6c5fd8e52b017 100644 --- a/llvm/test/CodeGen/AArch64/fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/fsqrt.ll @@ -196,27 +196,21 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v2.h[0], v0.h[4] ; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: fsqrt v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v2.h[2] ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: sqrt_v7f16: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 6baf1a84d407c..b00e5d6c701d8 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ 
b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1228,18 +1228,18 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; CHECK-GI-LABEL: v3i32_i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov v5.s[0], w9 ; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[1], w9 ; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v5.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v4.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 8b82004388b09..296e267a9c7f0 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -250,23 +250,13 @@ entry: } define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) { -; CHECK-SD-LABEL: insert_v3f32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-SD-NEXT: mov v1.s[1], v0.s[1] -; CHECK-SD-NEXT: mov v1.s[2], v0.s[2] -; CHECK-SD-NEXT: mov v0.16b, v1.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: insert_v3f32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-GI-NEXT: mov s0, v0.s[2] -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: insert_v3f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %d = insertelement <3 x float> %a, float %b, i32 0 ret <3 x float> %d @@ -281,10 +271,11 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3f32_2: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x float> %a, float %b, i32 2 @@ -983,11 +974,9 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v1.s[0], w0 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] ; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: @@ -1003,10 +992,10 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_2: ; 
CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], w0 +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x i32> %a, i32 %b, i32 2 diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 7a4c5cee27b80..4ac04798e1548 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -3309,30 +3309,28 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-GI-LABEL: stofp_v3i8_v3f64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] ; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: smov x9, v0.s[1] -; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: smov x8, v0.s[0] +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: smov x8, v1.s[0] +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: smov x9, v1.s[1] +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: smov x9, v0.s[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: scvtf v0.2d, v1.2d -; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: scvtf v2.2d, v1.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: scvtf v2.2d, v2.2d ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret @@ -3363,30 +3361,28 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-GI-LABEL: utofp_v3i8_v3f64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] ; CHECK-GI-NEXT: mov h2, v1.h[1] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: mov w8, v0.s[0] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov w8, v1.s[0] +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x9 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: ucvtf v0.2d, v1.2d -; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d 
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret @@ -4479,13 +4475,13 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: scvtf v1.2d, v2.2d +; CHECK-GI-NEXT: scvtf v2.2d, v2.2d ; CHECK-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i64> %a to <3 x float> @@ -4511,13 +4507,13 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ucvtf v1.2d, v2.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d ; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d -; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i64> %a to <3 x float> @@ -5267,10 +5263,8 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-LABEL: stofp_v3i8_v3f32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: mov s1, v0.s[1] @@ -5302,11 +5296,9 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-LABEL: utofp_v3i8_v3f32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v0.h[2], w2 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: mov h2, v0.h[1] @@ -6227,9 +6219,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -6276,9 +6268,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, 
v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7215,9 +7207,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7238,9 +7230,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7448,9 +7440,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7459,8 +7451,8 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -7491,9 +7483,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7502,8 +7494,8 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; 
CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -7986,9 +7978,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -7997,9 +7989,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] @@ -8048,9 +8040,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -8059,16 +8051,16 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-FP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-FP16-NEXT: mov s1, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -8105,10 +8097,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-LABEL: stofp_v3i8_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fmov s0, w0 -; CHECK-GI-NOFP16-NEXT: fmov s1, w1 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s1, w2 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2 ; CHECK-GI-NOFP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NOFP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] @@ -8126,10 +8116,8 
@@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fmov s0, w0 -; CHECK-GI-FP16-NEXT: fmov s1, w1 -; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-FP16-NEXT: fmov s1, w2 -; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 +; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ret @@ -8162,11 +8150,9 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-LABEL: utofp_v3i8_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fmov s0, w0 -; CHECK-GI-NOFP16-NEXT: fmov s1, w1 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s1, w2 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] ; CHECK-GI-NOFP16-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w2 ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] @@ -8183,10 +8169,8 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fmov s0, w0 -; CHECK-GI-FP16-NEXT: fmov s1, w1 -; CHECK-GI-FP16-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-FP16-NEXT: fmov s1, w2 -; CHECK-GI-FP16-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 +; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index 51d17ad0644f1..c1ea891bc86e7 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -267,21 +267,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h10 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; GISEL-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 +; GISEL-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; GISEL-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload ; GISEL-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v2.h[0] -; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; GISEL-NEXT: mov v1.h[1], v3.h[0] ; GISEL-NEXT: mov v1.h[2], v2.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] ; GISEL-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c3c0ec5e3d9d8..a4d1c53c272aa 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -118,7 +118,7 @@ define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){ ; ; CHECK-GI-LABEL: load_v2i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] ; CHECK-GI-NEXT: ldr b1, [x0, #1] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ 
-158,8 +158,8 @@ define <2 x i16> @load_v2i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -235,6 +235,7 @@ define <7 x i8> @load_v7i8(ptr %ptr){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr b0, [x0] ; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: mov v0.b[0], v0.b[0] ; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: ldr b1, [x0, #2] ; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] @@ -261,10 +262,10 @@ define <3 x i16> @load_v3i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v3i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %a = load <3 x i16>, ptr %ptr @@ -280,18 +281,18 @@ define <7 x i16> @load_v7i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v7i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #6] -; CHECK-GI-NEXT: mov v0.h[3], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #8] -; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #10] -; CHECK-GI-NEXT: mov v0.h[5], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #12] -; CHECK-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-GI-NEXT: add x8, x0, #6 +; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8] +; CHECK-GI-NEXT: add x8, x0, #8 +; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8] +; CHECK-GI-NEXT: add x8, x0, #10 +; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8] +; CHECK-GI-NEXT: add x8, x0, #12 +; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8] ; CHECK-GI-NEXT: ret %a = load <7 x i16>, ptr %ptr ret <7 x i16> %a @@ -305,10 +306,11 @@ define <3 x i32> @load_v3i32(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldp s0, s1, [x0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: ldr s1, [x0, #8] -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #8 +; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8] ; CHECK-GI-NEXT: ret %a = load <3 x i32>, ptr %ptr ret <3 x i32> %a diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 02258bc47c54d..9e748c9641aa8 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -83,13 +83,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov 
v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -124,22 +124,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -171,27 +167,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -259,13 +255,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; 
CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -293,18 +289,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] @@ -416,14 +410,14 @@ define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) { ; ; CHECK-GI-LABEL: v2i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v1.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v1.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: ret entry: %s = mul <2 x i64> %d, %e @@ -461,16 +455,16 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] -; CHECK-GI-NEXT: fmov x10, d0 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x8, v0.d[1] -; CHECK-GI-NEXT: mov x9, v3.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v3.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: fmov d0, x10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d5 ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -515,10 +509,10 @@ define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: mul x9, x9, x12 -; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: mov v1.d[1], x11 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 50c0c8b11e751..dbb4270fb8002 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1120,12 +1120,10 @@ define <4 x i16> 
@vselect_constant_cond_zero_v4i16(<4 x i16> %a) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v3.16b, v1.16b -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v1.8h, v3.8b, #0 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b @@ -1148,10 +1146,9 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) { ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v3.16b, v1.16b -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v1.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v2.d[0] ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 @@ -1199,12 +1196,10 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v4.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v4.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v4.b[3], v2.b[0] -; CHECK-GI-NEXT: ushll v2.8h, v4.8b, #0 +; CHECK-GI-NEXT: mov v2.b[1], w9 +; CHECK-GI-NEXT: mov v2.b[2], w9 +; CHECK-GI-NEXT: mov v2.b[3], w8 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: shl v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b @@ -1227,10 +1222,9 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s2, w8 ; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v4.16b, v2.16b -; CHECK-GI-NEXT: mov v4.h[1], v3.h[0] -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] ; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31 diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index 59958afdd0d1e..adc89f7a0d99d 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -2674,10 +2674,10 @@ define <4 x i32> @fcmal4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v0.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] ; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 @@ -2725,10 +2725,10 @@ define <4 x i32> @fcmnv4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-NEXT: mov w8, #0 // =0x0 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v1.h[1], 
v0.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v0.h[0] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] ; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6f4b090fb22bd..f5e566f49b91e 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -1266,133 +1266,99 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; ; CHECK-GI-LABEL: v20: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr w9, [sp, #64] -; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: and w13, w2, #0xff -; CHECK-GI-NEXT: ldr w11, [sp, #80] -; CHECK-GI-NEXT: ldr w12, [sp, #88] -; CHECK-GI-NEXT: fmov s19, w13 -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #224] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #232] -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #240] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #248] -; CHECK-GI-NEXT: fmov s1, w12 -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: and w10, w1, #0xff -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: and w9, w0, #0xff -; CHECK-GI-NEXT: ldrb w11, [sp] -; CHECK-GI-NEXT: ldrb w12, [sp, #8] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: fmov s20, w10 -; CHECK-GI-NEXT: ldrb w9, [sp, #96] -; CHECK-GI-NEXT: ldrb w10, [sp, #104] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s21, w12 -; CHECK-GI-NEXT: ldrb w11, [sp, #160] -; CHECK-GI-NEXT: mov v0.b[1], v16.b[0] -; CHECK-GI-NEXT: fmov s18, w9 -; CHECK-GI-NEXT: fmov s22, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #96] +; CHECK-GI-NEXT: and w11, w1, #0xff +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: ldrb w9, [sp] +; CHECK-GI-NEXT: ldrb w12, [sp, #104] +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #160] +; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #168] -; CHECK-GI-NEXT: mov v6.h[1], v20.h[0] -; CHECK-GI-NEXT: fmov s20, w11 -; CHECK-GI-NEXT: ldrb w10, [sp, #16] -; CHECK-GI-NEXT: mov v17.h[1], v21.h[0] -; CHECK-GI-NEXT: fmov s21, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #112] -; CHECK-GI-NEXT: mov v18.h[1], v22.h[0] -; CHECK-GI-NEXT: fmov s23, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #176] -; CHECK-GI-NEXT: and w11, w3, #0xff -; CHECK-GI-NEXT: mov v2.b[1], v7.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v6.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: mov v20.h[1], v21.h[0] -; CHECK-GI-NEXT: ldrb w9, [sp, #24] -; CHECK-GI-NEXT: fmov s22, w11 -; CHECK-GI-NEXT: mov v17.h[2], v23.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #8] +; CHECK-GI-NEXT: fmov s3, w10 +; CHECK-GI-NEXT: mov v2.h[1], w12 +; CHECK-GI-NEXT: and w10, w2, #0xff +; CHECK-GI-NEXT: and w12, w5, #0xff +; CHECK-GI-NEXT: mov v1.h[1], w11 ; CHECK-GI-NEXT: and w11, w4, #0xff -; CHECK-GI-NEXT: mov v18.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #120] -; CHECK-GI-NEXT: fmov s23, w9 +; CHECK-GI-NEXT: mov v3.h[1], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.h[2], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #16] +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #176] +; CHECK-GI-NEXT: mov v1.h[2], w10 +; CHECK-GI-NEXT: and w10, w3, #0xff +; CHECK-GI-NEXT: mov v3.h[2], w9 
+; CHECK-GI-NEXT: ldrb w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.h[3], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #24] +; CHECK-GI-NEXT: mov v2.h[3], w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #184] -; CHECK-GI-NEXT: mov v6.h[3], v22.h[0] -; CHECK-GI-NEXT: fmov s21, w11 -; CHECK-GI-NEXT: and w11, w6, #0xff -; CHECK-GI-NEXT: mov v2.b[2], v5.b[0] -; CHECK-GI-NEXT: mov v20.h[2], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 -; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: mov v1.h[3], w10 +; CHECK-GI-NEXT: ldr w10, [sp, #64] +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ldrb w9, [sp, #128] -; CHECK-GI-NEXT: and w10, w5, #0xff -; CHECK-GI-NEXT: mov v17.h[3], v23.h[0] -; CHECK-GI-NEXT: mov v6.h[4], v21.h[0] -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: mov v18.h[3], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #192] -; CHECK-GI-NEXT: mov v20.h[3], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #32] -; CHECK-GI-NEXT: mov v2.b[3], v4.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov v18.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w10 +; CHECK-GI-NEXT: mov v0.h[4], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #32] +; CHECK-GI-NEXT: fmov s4, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #192] +; CHECK-GI-NEXT: mov v2.h[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #72] +; CHECK-GI-NEXT: mov v1.h[4], w11 +; CHECK-GI-NEXT: ldr w11, [sp, #224] +; CHECK-GI-NEXT: mov v3.h[4], w10 ; CHECK-GI-NEXT: ldrb w10, [sp, #136] -; CHECK-GI-NEXT: mov v6.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w10 -; CHECK-GI-NEXT: ldrb w10, [sp, #48] -; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-GI-NEXT: mov v17.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w9 -; CHECK-GI-NEXT: ldrb w9, [sp, #40] -; CHECK-GI-NEXT: mov v18.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: fmov s5, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #232] +; CHECK-GI-NEXT: mov v0.h[5], w12 +; CHECK-GI-NEXT: ldrb w12, [sp, #40] +; CHECK-GI-NEXT: mov v2.h[5], w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #200] ; CHECK-GI-NEXT: ldrb w9, [sp, #144] -; CHECK-GI-NEXT: mov v20.h[4], v19.h[0] -; CHECK-GI-NEXT: fmov s19, w11 -; CHECK-GI-NEXT: ldrb w11, [sp, #200] -; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-GI-NEXT: fmov s7, w11 -; CHECK-GI-NEXT: mov v17.h[5], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w9 -; CHECK-GI-NEXT: ldrb w11, [sp, #208] -; CHECK-GI-NEXT: mov v6.h[6], v19.h[0] -; CHECK-GI-NEXT: ldrb w9, [sp, #56] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v20.h[5], v7.h[0] -; CHECK-GI-NEXT: fmov s7, w10 -; CHECK-GI-NEXT: mov v18.h[6], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w11 +; CHECK-GI-NEXT: mov v5.b[1], w11 +; CHECK-GI-NEXT: mov v1.h[5], w12 +; CHECK-GI-NEXT: mov v3.h[5], w10 +; CHECK-GI-NEXT: ldr w10, [sp, #80] +; CHECK-GI-NEXT: ldr w12, [sp, #240] +; CHECK-GI-NEXT: and w11, w6, #0xff +; CHECK-GI-NEXT: mov v0.h[6], w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #48] +; CHECK-GI-NEXT: mov v2.h[6], w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #208] +; CHECK-GI-NEXT: mov v4.b[2], w10 ; CHECK-GI-NEXT: ldrb w10, [sp, #152] -; CHECK-GI-NEXT: and w11, w7, #0xff -; CHECK-GI-NEXT: fmov s3, w11 -; CHECK-GI-NEXT: str q0, [x8, #64] -; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: mov v5.b[2], w12 +; CHECK-GI-NEXT: mov v1.h[6], w11 +; CHECK-GI-NEXT: ldr w11, [sp, #248] +; CHECK-GI-NEXT: mov v3.h[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: and w12, w7, #0xff +; CHECK-GI-NEXT: mov v0.h[7], w12 +; CHECK-GI-NEXT: mov v2.h[7], w10 +; CHECK-GI-NEXT: ldrb w12, [sp, #56] +; CHECK-GI-NEXT: 
mov v4.b[3], w9 ; CHECK-GI-NEXT: ldrb w10, [sp, #216] -; CHECK-GI-NEXT: mov v17.h[6], v7.h[0] -; CHECK-GI-NEXT: mov v20.h[6], v16.h[0] -; CHECK-GI-NEXT: fmov s7, w9 -; CHECK-GI-NEXT: mov v6.h[7], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v18.h[7], v5.h[0] -; CHECK-GI-NEXT: mov v17.h[7], v7.h[0] -; CHECK-GI-NEXT: mov v20.h[7], v3.h[0] -; CHECK-GI-NEXT: add v1.8h, v6.8h, v18.8h -; CHECK-GI-NEXT: add v3.8h, v17.8h, v20.8h +; CHECK-GI-NEXT: mov v5.b[3], w11 +; CHECK-GI-NEXT: mov v1.h[7], w12 +; CHECK-GI-NEXT: mov v3.h[7], w10 +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ushll v2.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v4.8h, v5.8b, #0 +; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: add v2.4h, v2.4h, v4.4h ; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: stp q4, q1, [x8] -; CHECK-GI-NEXT: stp q2, q3, [x8, #32] +; CHECK-GI-NEXT: stp q3, q0, [x8] +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: stp q4, q1, [x8, #32] +; CHECK-GI-NEXT: str q2, [x8, #64] ; CHECK-GI-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> @@ -1497,107 +1463,83 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; ; CHECK-GI-LABEL: i12: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w12, [sp] +; CHECK-GI-NEXT: ldr w14, [sp, #32] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w11, [sp, #32] -; CHECK-GI-NEXT: ldr w12, [sp, #40] -; CHECK-GI-NEXT: fmov s5, w7 -; CHECK-GI-NEXT: ldr w10, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr w16, [sp, #128] +; CHECK-GI-NEXT: ldr w17, [sp, #160] ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: fmov s2, w12 +; CHECK-GI-NEXT: fmov s3, w14 +; CHECK-GI-NEXT: ldr w12, [sp, #64] +; CHECK-GI-NEXT: ldr w14, [sp, #96] +; CHECK-GI-NEXT: ldr w13, [sp, #8] +; CHECK-GI-NEXT: ldr w15, [sp, #40] ; CHECK-GI-NEXT: fmov s4, w12 -; CHECK-GI-NEXT: ldr w12, [sp, #96] -; CHECK-GI-NEXT: ldr w13, [sp, #104] -; CHECK-GI-NEXT: ldr w14, [sp, #128] -; CHECK-GI-NEXT: ldr w15, [sp, #136] -; CHECK-GI-NEXT: ldr w16, [sp, #160] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: fmov s7, w13 -; CHECK-GI-NEXT: fmov s16, w15 -; CHECK-GI-NEXT: ldr w17, [sp, #168] -; CHECK-GI-NEXT: ldr w9, [sp, #24] -; CHECK-GI-NEXT: ldr w13, [sp, #176] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: fmov s17, w17 -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: fmov s6, w16 +; CHECK-GI-NEXT: fmov s7, w17 +; CHECK-GI-NEXT: fmov s5, w14 +; CHECK-GI-NEXT: mov v2.h[1], w13 +; CHECK-GI-NEXT: mov v3.h[1], w15 +; CHECK-GI-NEXT: ldr w13, [sp, #72] +; CHECK-GI-NEXT: ldr w15, [sp, #104] +; CHECK-GI-NEXT: ldr w12, [sp, #136] +; CHECK-GI-NEXT: ldr w18, [sp, #168] +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v4.h[1], w13 +; CHECK-GI-NEXT: mov v5.h[1], w15 +; CHECK-GI-NEXT: mov v6.h[1], w12 +; CHECK-GI-NEXT: mov v7.h[1], w18 +; CHECK-GI-NEXT: ldr w10, [sp, #16] ; CHECK-GI-NEXT: ldr w11, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[3], 
v5.h[0] -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #64] -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: ldr w11, [sp, #72] -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: fmov s6, w11 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: mov v3.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: ldr w10, [sp, #112] -; CHECK-GI-NEXT: ldr w11, [sp, #144] -; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w12 -; CHECK-GI-NEXT: fmov s18, w11 -; CHECK-GI-NEXT: ldr w12, [sp, #88] +; CHECK-GI-NEXT: ldr w12, [sp, #80] +; CHECK-GI-NEXT: ldr w13, [sp, #112] +; CHECK-GI-NEXT: ldr w14, [sp, #144] +; CHECK-GI-NEXT: ldr w15, [sp, #176] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w10 +; CHECK-GI-NEXT: mov v3.h[2], w11 +; CHECK-GI-NEXT: mov v4.h[2], w12 +; CHECK-GI-NEXT: mov v5.h[2], w13 +; CHECK-GI-NEXT: mov v6.h[2], w14 +; CHECK-GI-NEXT: mov v7.h[2], w15 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: ldr w10, [sp, #88] +; CHECK-GI-NEXT: ldr w11, [sp, #120] +; CHECK-GI-NEXT: ldr w12, [sp, #152] +; CHECK-GI-NEXT: ldr w13, [sp, #184] +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 +; CHECK-GI-NEXT: mov v4.h[3], w10 +; CHECK-GI-NEXT: mov v5.h[3], w11 +; CHECK-GI-NEXT: mov v6.h[3], w12 +; CHECK-GI-NEXT: mov v7.h[3], w13 +; CHECK-GI-NEXT: movi v16.4s, #15, msl #8 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NEXT: fmov s7, w14 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v7.h[1], v16.h[0] -; CHECK-GI-NEXT: fmov s16, w16 -; CHECK-GI-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #152] -; CHECK-GI-NEXT: mov v7.h[2], v18.h[0] -; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #120] -; CHECK-GI-NEXT: mov v5.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w10 -; CHECK-GI-NEXT: ldr w10, [sp, #184] -; CHECK-GI-NEXT: mov v3.h[3], v18.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s18, w10 -; CHECK-GI-NEXT: mov v6.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w13 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mov v16.h[2], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w12 -; CHECK-GI-NEXT: mov v6.h[3], v4.h[0] -; CHECK-GI-NEXT: movi v4.4s, #15, msl #8 -; CHECK-GI-NEXT: mov v5.h[3], v17.h[0] -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: mov v16.h[3], v18.h[0] -; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-GI-NEXT: mov v7.h[3], v17.h[0] -; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b -; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 -; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-GI-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-GI-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-GI-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-GI-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-GI-NEXT: add v2.4s, 
v2.4s, v7.4s -; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v16.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v16.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v16.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-GI-NEXT: and v4.16b, v4.16b, v16.16b +; CHECK-GI-NEXT: and v5.16b, v5.16b, v16.16b +; CHECK-GI-NEXT: and v6.16b, v6.16b, v16.16b +; CHECK-GI-NEXT: and v7.16b, v7.16b, v16.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s ; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i12> %s0 to <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll index 3dbc033dfab96..f83ac8ed642cc 100644 --- a/llvm/test/CodeGen/AArch64/neon-extmul.ll +++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll @@ -272,18 +272,18 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-GI-NEXT: mul x15, x15, x16 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mov v0.d[1], x12 ; CHECK-GI-NEXT: mul x11, x11, x14 ; CHECK-GI-NEXT: mov x14, v6.d[1] +; CHECK-GI-NEXT: mov v0.d[1], x12 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: mul x14, x14, x17 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x13 +; CHECK-GI-NEXT: mov v3.d[0], x11 ; CHECK-GI-NEXT: mov v2.d[1], x14 +; CHECK-GI-NEXT: mov v3.d[1], x13 ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -423,22 +423,22 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) ; CHECK-GI-NEXT: mul x15, x15, x16 ; CHECK-GI-NEXT: mul x10, x10, x11 ; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[0], x9 ; CHECK-GI-NEXT: mul x13, x13, x18 -; CHECK-GI-NEXT: mov v0.d[1], x12 ; CHECK-GI-NEXT: mul x11, x11, x14 ; CHECK-GI-NEXT: mov x14, v18.d[1] +; CHECK-GI-NEXT: mov v0.d[1], x12 +; CHECK-GI-NEXT: mov v6.d[0], x10 ; CHECK-GI-NEXT: mov v1.d[1], x15 -; CHECK-GI-NEXT: fmov d6, x10 -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mul x14, x14, x17 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v7.d[0], x11 ; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-GI-NEXT: fmov d7, x11 -; CHECK-GI-NEXT: mov v7.d[1], x13 ; CHECK-GI-NEXT: mov v6.d[1], x14 -; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d +; CHECK-GI-NEXT: mov v7.d[1], x13 ; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d +; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d ; CHECK-GI-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll index 15763543113eb..2897741780f60 100644 --- a/llvm/test/CodeGen/AArch64/neon-perm.ll +++ b/llvm/test/CodeGen/AArch64/neon-perm.ll @@ -1741,12 +1741,13 @@ define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; 
CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index af283f6a093e9..3263a5e03c1fd 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -77,17 +77,18 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) { ; ; CHECK-GI-LABEL: vector_gep_v3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: smov x8, v3.s[0] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: smov x9, v3.s[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fmov d1, x8 -; CHECK-GI-NEXT: mov w8, v3.s[2] -; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: smov x9, v3.s[0] +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: smov x10, v3.s[1] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v4.d[0], x9 ; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov w8, v3.s[2] +; CHECK-GI-NEXT: mov v4.d[1], x10 ; CHECK-GI-NEXT: add x8, x9, w8, sxtw -; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d ; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: str q0, [x0] ; CHECK-GI-NEXT: ret @@ -166,17 +167,18 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) { ; ; CHECK-GI-LABEL: vector_gep_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 -; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: str x8, [x0, #16] ; CHECK-GI-NEXT: str q0, [x0] ; CHECK-GI-NEXT: ret entry: @@ -206,13 +208,21 @@ entry: } define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) { -; CHECK-LABEL: vector_gep_v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: mov v1.d[1], x2 -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: str q0, [x4] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vector_gep_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: mov v1.d[1], x2 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: str q0, [x4] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.d[0], x0 +; CHECK-GI-NEXT: mov v1.d[1], x2 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: str q0, [x4] +; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off store <2 x ptr> %g, ptr %p diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll index 81682c5f0ce85..d807635f5d87d 100644 --- a/llvm/test/CodeGen/AArch64/rem.ll +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -190,7 +190,7 @@ define 
<2 x i8> @sv2i8(<2 x i8> %d, <2 x i8> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -284,7 +284,7 @@ define <4 x i8> @sv4i8(<4 x i8> %d, <4 x i8> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -363,12 +363,12 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -381,11 +381,11 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -527,20 +527,20 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mov w18, v1.s[1] ; CHECK-GI-NEXT: mov w0, v1.s[2] ; CHECK-GI-NEXT: mov w1, v1.s[3] -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[1] ; CHECK-GI-NEXT: mov w9, v3.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[2] ; CHECK-GI-NEXT: mov w9, v3.s[2] -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[3] -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v16.s[2], w9 @@ -552,7 +552,8 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: sdiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] ; CHECK-GI-NEXT: mov w13, v5.s[2] -; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: mov v17.s[0], w14 +; CHECK-GI-NEXT: mov w14, v7.s[3] ; CHECK-GI-NEXT: sdiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w15 @@ -565,7 +566,7 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: sdiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v0.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: sdiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v0.s[3] ; CHECK-GI-NEXT: mov v18.s[1], w17 @@ -579,11 +580,10 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: uzp1 v1.8h, 
v2.8h, v4.8h ; CHECK-GI-NEXT: sdiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: sdiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: sdiv w10, w11, w14 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -866,14 +866,13 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; ; CHECK-GI-LABEL: sv32i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #112 -; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 @@ -902,43 +901,41 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: fmov w9, s7 ; CHECK-GI-NEXT: mov w12, v7.s[3] ; CHECK-GI-NEXT: fmov w13, s5 -; CHECK-GI-NEXT: mov w14, v5.s[1] ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: fmov w6, s19 ; CHECK-GI-NEXT: mov w7, v19.s[3] ; CHECK-GI-NEXT: fmov w21, s17 -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: mov w23, v17.s[3] +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[1] ; CHECK-GI-NEXT: mov w9, v7.s[1] -; CHECK-GI-NEXT: mov w22, v17.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[2] ; CHECK-GI-NEXT: mov w9, v7.s[2] -; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v20.s[0], w11 ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[3] ; CHECK-GI-NEXT: sshll2 v6.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: sshll v28.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v20.s[2], w9 -; CHECK-GI-NEXT: sdiv w13, w12, w13 +; CHECK-GI-NEXT: sdiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[1] -; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; CHECK-GI-NEXT: mov v20.s[3], w11 -; CHECK-GI-NEXT: sdiv w15, w12, w14 +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v20.s[3], w8 +; CHECK-GI-NEXT: sdiv w14, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] ; CHECK-GI-NEXT: sshll v5.4s, v6.4h, #0 -; CHECK-GI-NEXT: fmov s21, w13 -; CHECK-GI-NEXT: sdiv w14, w12, w14 +; CHECK-GI-NEXT: mov v21.s[0], w15 +; CHECK-GI-NEXT: sdiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: sshll2 v4.8h, v2.16b, #0 -; CHECK-GI-NEXT: mov v21.s[1], w15 +; 
CHECK-GI-NEXT: mov v21.s[1], w14 ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: sshll v7.4s, v4.4h, #0 ; CHECK-GI-NEXT: sshll v30.4s, v2.4h, #0 @@ -947,72 +944,72 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s ; CHECK-GI-NEXT: sdiv w12, w12, w16 ; CHECK-GI-NEXT: fmov w16, s5 -; CHECK-GI-NEXT: mov v21.s[2], w14 -; CHECK-GI-NEXT: sdiv w18, w16, w17 +; CHECK-GI-NEXT: mov v21.s[2], w13 +; CHECK-GI-NEXT: sdiv w1, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[1] ; CHECK-GI-NEXT: mov w17, v7.s[1] ; CHECK-GI-NEXT: mov v21.s[3], w12 ; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s -; CHECK-GI-NEXT: sdiv w1, w16, w17 +; CHECK-GI-NEXT: sdiv w0, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[2] ; CHECK-GI-NEXT: mov w17, v7.s[2] -; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: mov v22.s[0], w1 ; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h -; CHECK-GI-NEXT: sdiv w0, w16, w17 +; CHECK-GI-NEXT: sdiv w18, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: mov w17, v7.s[3] ; CHECK-GI-NEXT: sshll2 v5.4s, v6.8h, #0 ; CHECK-GI-NEXT: sshll2 v7.4s, v4.8h, #0 -; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: mov v22.s[1], w0 ; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: fmov w2, s7 -; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: mov w4, v7.s[3] ; CHECK-GI-NEXT: sdiv w16, w16, w17 ; CHECK-GI-NEXT: fmov w17, s5 -; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: mov v22.s[2], w18 ; CHECK-GI-NEXT: sdiv w5, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[1] ; CHECK-GI-NEXT: mov w2, v7.s[1] ; CHECK-GI-NEXT: mov v22.s[3], w16 ; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s -; CHECK-GI-NEXT: sdiv w4, w17, w2 +; CHECK-GI-NEXT: sdiv w3, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[2] ; CHECK-GI-NEXT: mov w2, v7.s[2] -; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: mov v23.s[0], w5 ; CHECK-GI-NEXT: sdiv w2, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[3] -; CHECK-GI-NEXT: mov v23.s[1], w4 -; CHECK-GI-NEXT: sdiv w17, w17, w3 -; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: mov v23.s[1], w3 +; CHECK-GI-NEXT: sdiv w17, w17, w4 +; CHECK-GI-NEXT: fmov w4, s18 ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: sdiv w20, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: sdiv w20, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[1] ; CHECK-GI-NEXT: mov w6, v19.s[1] ; CHECK-GI-NEXT: mov v23.s[3], w17 ; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s -; CHECK-GI-NEXT: sdiv w19, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: sdiv w19, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[2] ; CHECK-GI-NEXT: mov w6, v19.s[2] -; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: mov v24.s[0], w20 ; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h ; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: sdiv w6, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: sdiv w6, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[3] ; CHECK-GI-NEXT: mov v24.s[1], w19 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w3, w3, w7 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w4, w4, w7 ; CHECK-GI-NEXT: fmov w7, s16 ; CHECK-GI-NEXT: mov v24.s[2], w6 -; CHECK-GI-NEXT: sdiv w23, w7, w21 +; CHECK-GI-NEXT: sdiv w24, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[1] ; CHECK-GI-NEXT: mov w21, v17.s[1] -; CHECK-GI-NEXT: mov v24.s[3], w3 -; CHECK-GI-NEXT: sdiv w24, w7, w21 +; CHECK-GI-NEXT: mov v24.s[3], w4 +; CHECK-GI-NEXT: sdiv w22, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[2] ; CHECK-GI-NEXT: mov w21, v17.s[2] ; 
CHECK-GI-NEXT: sshll2 v17.8h, v1.16b, #0 -; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: mov v25.s[0], w24 ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: sshll v18.4s, v17.4h, #0 ; CHECK-GI-NEXT: sshll v29.4s, v1.4h, #0 @@ -1020,9 +1017,8 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: sdiv w21, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[3] ; CHECK-GI-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: mov v25.s[1], w22 ; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: sshll v19.4s, v16.4h, #0 ; CHECK-GI-NEXT: sshll v31.4s, v3.4h, #0 ; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 @@ -1032,51 +1028,51 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov w28, v19.s[3] ; CHECK-GI-NEXT: sshll2 v19.4s, v16.8h, #0 ; CHECK-GI-NEXT: sshll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: sdiv w7, w7, w22 -; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: sdiv w7, w7, w23 +; CHECK-GI-NEXT: fmov w23, s18 ; CHECK-GI-NEXT: mov v25.s[2], w21 ; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: fmov w29, s19 ; CHECK-GI-NEXT: mov w30, v19.s[1] -; CHECK-GI-NEXT: mov w8, v19.s[2] -; CHECK-GI-NEXT: mov w10, v19.s[3] -; CHECK-GI-NEXT: sdiv w25, w22, w25 -; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov w15, v19.s[2] +; CHECK-GI-NEXT: sdiv w25, w23, w25 +; CHECK-GI-NEXT: mov w23, v18.s[1] ; CHECK-GI-NEXT: mov v25.s[3], w7 ; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s -; CHECK-GI-NEXT: sdiv w26, w22, w26 -; CHECK-GI-NEXT: mov w22, v18.s[2] -; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: sdiv w26, w23, w26 +; CHECK-GI-NEXT: mov w23, v18.s[2] +; CHECK-GI-NEXT: mov v26.s[0], w25 ; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h -; CHECK-GI-NEXT: sdiv w27, w22, w27 -; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: sdiv w27, w23, w27 +; CHECK-GI-NEXT: mov w23, v18.s[3] ; CHECK-GI-NEXT: sshll2 v18.4s, v17.8h, #0 ; CHECK-GI-NEXT: mov v26.s[1], w26 ; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w11, v18.s[2] ; CHECK-GI-NEXT: mov w9, v18.s[3] -; CHECK-GI-NEXT: sdiv w22, w22, w28 +; CHECK-GI-NEXT: sdiv w23, w23, w28 ; CHECK-GI-NEXT: fmov w28, s18 ; CHECK-GI-NEXT: mov v26.s[2], w27 ; CHECK-GI-NEXT: sdiv w28, w28, w29 ; CHECK-GI-NEXT: mov w29, v18.s[1] -; CHECK-GI-NEXT: mov v26.s[3], w22 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v26.s[3], w23 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s ; CHECK-GI-NEXT: sdiv w29, w29, w30 -; CHECK-GI-NEXT: mov w30, v18.s[2] -; CHECK-GI-NEXT: fmov s27, w28 -; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[0], w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w10, w11, w15 +; CHECK-GI-NEXT: mov w11, v19.s[3] ; CHECK-GI-NEXT: mov v27.s[1], w29 -; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: mov v27.s[2], w8 -; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: sdiv w8, w9, w11 +; CHECK-GI-NEXT: mov v27.s[2], w10 +; CHECK-GI-NEXT: mov v27.s[3], w8 ; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s ; CHECK-GI-NEXT: uzp1 v3.8h, 
v17.8h, v18.8h ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %s = srem <32 x i8> %d, %e @@ -1113,7 +1109,7 @@ define <2 x i8> @uv2i8(<2 x i8> %d, <2 x i8> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1206,7 +1202,7 @@ define <4 x i8> @uv4i8(<4 x i8> %d, <4 x i8> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -1285,12 +1281,12 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -1303,11 +1299,11 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -1449,20 +1445,20 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mov w18, v1.s[1] ; CHECK-GI-NEXT: mov w0, v1.s[2] ; CHECK-GI-NEXT: mov w1, v1.s[3] -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[1] ; CHECK-GI-NEXT: mov w9, v3.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[2] ; CHECK-GI-NEXT: mov w9, v3.s[2] -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v2.s[3] -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v16.s[2], w9 @@ -1474,7 +1470,8 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: udiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] ; CHECK-GI-NEXT: mov w13, v5.s[2] -; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: mov v17.s[0], w14 +; CHECK-GI-NEXT: mov w14, v7.s[3] ; CHECK-GI-NEXT: udiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w15 @@ -1487,7 +1484,7 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: udiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v0.s[2] -; CHECK-GI-NEXT: 
fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: udiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v0.s[3] ; CHECK-GI-NEXT: mov v18.s[1], w17 @@ -1501,11 +1498,10 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h ; CHECK-GI-NEXT: udiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: udiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: udiv w10, w11, w14 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -1788,14 +1784,13 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; ; CHECK-GI-LABEL: uv32i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sub sp, sp, #112 -; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 @@ -1824,43 +1819,41 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: fmov w9, s7 ; CHECK-GI-NEXT: mov w12, v7.s[3] ; CHECK-GI-NEXT: fmov w13, s5 -; CHECK-GI-NEXT: mov w14, v5.s[1] ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: fmov w6, s19 ; CHECK-GI-NEXT: mov w7, v19.s[3] ; CHECK-GI-NEXT: fmov w21, s17 -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: mov w23, v17.s[3] +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[1] ; CHECK-GI-NEXT: mov w9, v7.s[1] -; CHECK-GI-NEXT: mov w22, v17.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[2] ; CHECK-GI-NEXT: mov w9, v7.s[2] -; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v20.s[0], w11 ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v6.s[3] ; CHECK-GI-NEXT: ushll2 v6.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v28.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 ; CHECK-GI-NEXT: mov v20.s[2], w9 -; CHECK-GI-NEXT: udiv w13, w12, w13 +; CHECK-GI-NEXT: udiv w15, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[1] -; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; CHECK-GI-NEXT: mov v20.s[3], w11 -; CHECK-GI-NEXT: udiv w15, w12, w14 +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v20.s[3], w8 +; CHECK-GI-NEXT: udiv w14, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] ; CHECK-GI-NEXT: ushll v5.4s, v6.4h, #0 
-; CHECK-GI-NEXT: fmov s21, w13 -; CHECK-GI-NEXT: udiv w14, w12, w14 +; CHECK-GI-NEXT: mov v21.s[0], w15 +; CHECK-GI-NEXT: udiv w13, w12, w13 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: ushll2 v4.8h, v2.16b, #0 -; CHECK-GI-NEXT: mov v21.s[1], w15 +; CHECK-GI-NEXT: mov v21.s[1], w14 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v7.4s, v4.4h, #0 ; CHECK-GI-NEXT: ushll v30.4s, v2.4h, #0 @@ -1869,72 +1862,72 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s ; CHECK-GI-NEXT: udiv w12, w12, w16 ; CHECK-GI-NEXT: fmov w16, s5 -; CHECK-GI-NEXT: mov v21.s[2], w14 -; CHECK-GI-NEXT: udiv w18, w16, w17 +; CHECK-GI-NEXT: mov v21.s[2], w13 +; CHECK-GI-NEXT: udiv w1, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[1] ; CHECK-GI-NEXT: mov w17, v7.s[1] ; CHECK-GI-NEXT: mov v21.s[3], w12 ; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s -; CHECK-GI-NEXT: udiv w1, w16, w17 +; CHECK-GI-NEXT: udiv w0, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[2] ; CHECK-GI-NEXT: mov w17, v7.s[2] -; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: mov v22.s[0], w1 ; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h -; CHECK-GI-NEXT: udiv w0, w16, w17 +; CHECK-GI-NEXT: udiv w18, w16, w17 ; CHECK-GI-NEXT: mov w16, v5.s[3] ; CHECK-GI-NEXT: mov w17, v7.s[3] ; CHECK-GI-NEXT: ushll2 v5.4s, v6.8h, #0 ; CHECK-GI-NEXT: ushll2 v7.4s, v4.8h, #0 -; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: mov v22.s[1], w0 ; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 ; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: fmov w2, s7 -; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: mov w4, v7.s[3] ; CHECK-GI-NEXT: udiv w16, w16, w17 ; CHECK-GI-NEXT: fmov w17, s5 -; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: mov v22.s[2], w18 ; CHECK-GI-NEXT: udiv w5, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[1] ; CHECK-GI-NEXT: mov w2, v7.s[1] ; CHECK-GI-NEXT: mov v22.s[3], w16 ; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s -; CHECK-GI-NEXT: udiv w4, w17, w2 +; CHECK-GI-NEXT: udiv w3, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[2] ; CHECK-GI-NEXT: mov w2, v7.s[2] -; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: mov v23.s[0], w5 ; CHECK-GI-NEXT: udiv w2, w17, w2 ; CHECK-GI-NEXT: mov w17, v5.s[3] -; CHECK-GI-NEXT: mov v23.s[1], w4 -; CHECK-GI-NEXT: udiv w17, w17, w3 -; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: mov v23.s[1], w3 +; CHECK-GI-NEXT: udiv w17, w17, w4 +; CHECK-GI-NEXT: fmov w4, s18 ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: udiv w20, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: udiv w20, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[1] ; CHECK-GI-NEXT: mov w6, v19.s[1] ; CHECK-GI-NEXT: mov v23.s[3], w17 ; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s -; CHECK-GI-NEXT: udiv w19, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: udiv w19, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[2] ; CHECK-GI-NEXT: mov w6, v19.s[2] -; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: mov v24.s[0], w20 ; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h ; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: udiv w6, w3, w6 -; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: udiv w6, w4, w6 +; CHECK-GI-NEXT: mov w4, v18.s[3] ; CHECK-GI-NEXT: mov v24.s[1], w19 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w3, w3, w7 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w4, w4, w7 ; CHECK-GI-NEXT: fmov w7, s16 ; CHECK-GI-NEXT: mov v24.s[2], w6 -; CHECK-GI-NEXT: udiv w23, w7, w21 +; CHECK-GI-NEXT: udiv w24, w7, w21 ; CHECK-GI-NEXT: mov w7, 
v16.s[1] ; CHECK-GI-NEXT: mov w21, v17.s[1] -; CHECK-GI-NEXT: mov v24.s[3], w3 -; CHECK-GI-NEXT: udiv w24, w7, w21 +; CHECK-GI-NEXT: mov v24.s[3], w4 +; CHECK-GI-NEXT: udiv w22, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[2] ; CHECK-GI-NEXT: mov w21, v17.s[2] ; CHECK-GI-NEXT: ushll2 v17.8h, v1.16b, #0 -; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: mov v25.s[0], w24 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: ushll v18.4s, v17.4h, #0 ; CHECK-GI-NEXT: ushll v29.4s, v1.4h, #0 @@ -1942,9 +1935,8 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: udiv w21, w7, w21 ; CHECK-GI-NEXT: mov w7, v16.s[3] ; CHECK-GI-NEXT: ushll2 v16.8h, v3.16b, #0 -; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: mov v25.s[1], w22 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: ushll v19.4s, v16.4h, #0 ; CHECK-GI-NEXT: ushll v31.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 @@ -1954,51 +1946,51 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov w28, v19.s[3] ; CHECK-GI-NEXT: ushll2 v19.4s, v16.8h, #0 ; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-GI-NEXT: udiv w7, w7, w22 -; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: udiv w7, w7, w23 +; CHECK-GI-NEXT: fmov w23, s18 ; CHECK-GI-NEXT: mov v25.s[2], w21 ; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: fmov w29, s19 ; CHECK-GI-NEXT: mov w30, v19.s[1] -; CHECK-GI-NEXT: mov w8, v19.s[2] -; CHECK-GI-NEXT: mov w10, v19.s[3] -; CHECK-GI-NEXT: udiv w25, w22, w25 -; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov w15, v19.s[2] +; CHECK-GI-NEXT: udiv w25, w23, w25 +; CHECK-GI-NEXT: mov w23, v18.s[1] ; CHECK-GI-NEXT: mov v25.s[3], w7 ; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s -; CHECK-GI-NEXT: udiv w26, w22, w26 -; CHECK-GI-NEXT: mov w22, v18.s[2] -; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: udiv w26, w23, w26 +; CHECK-GI-NEXT: mov w23, v18.s[2] +; CHECK-GI-NEXT: mov v26.s[0], w25 ; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h -; CHECK-GI-NEXT: udiv w27, w22, w27 -; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: udiv w27, w23, w27 +; CHECK-GI-NEXT: mov w23, v18.s[3] ; CHECK-GI-NEXT: ushll2 v18.4s, v17.8h, #0 ; CHECK-GI-NEXT: mov v26.s[1], w26 ; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 -; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w11, v18.s[2] ; CHECK-GI-NEXT: mov w9, v18.s[3] -; CHECK-GI-NEXT: udiv w22, w22, w28 +; CHECK-GI-NEXT: udiv w23, w23, w28 ; CHECK-GI-NEXT: fmov w28, s18 ; CHECK-GI-NEXT: mov v26.s[2], w27 ; CHECK-GI-NEXT: udiv w28, w28, w29 ; CHECK-GI-NEXT: mov w29, v18.s[1] -; CHECK-GI-NEXT: mov v26.s[3], w22 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v26.s[3], w23 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s ; CHECK-GI-NEXT: udiv w29, w29, w30 -; CHECK-GI-NEXT: mov w30, v18.s[2] -; CHECK-GI-NEXT: fmov s27, w28 -; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[0], w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w10, w11, w15 +; CHECK-GI-NEXT: mov w11, v19.s[3] ; CHECK-GI-NEXT: mov v27.s[1], w29 -; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: udiv w9, 
w9, w10 -; CHECK-GI-NEXT: mov v27.s[2], w8 -; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: udiv w8, w9, w11 +; CHECK-GI-NEXT: mov v27.s[2], w10 +; CHECK-GI-NEXT: mov v27.s[3], w8 ; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s ; CHECK-GI-NEXT: uzp1 v3.8h, v17.8h, v18.8h ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %s = urem <32 x i8> %d, %e @@ -2037,7 +2029,7 @@ define <2 x i16> @sv2i16(<2 x i16> %d, <2 x i16> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -2086,11 +2078,9 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: sdiv w16, w14, w15 ; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -2139,7 +2129,7 @@ define <4 x i16> @sv4i16(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -2214,12 +2204,12 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -2232,11 +2222,11 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -2397,18 +2387,17 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mov w1, v7.s[3] ; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[1] ; CHECK-GI-NEXT: mov w9, v5.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[2] ; CHECK-GI-NEXT: mov w9, v5.s[2] ; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 ; CHECK-GI-NEXT: sshll 
v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: fmov w13, s5 ; CHECK-GI-NEXT: mov w14, v5.s[1] @@ -2417,7 +2406,7 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: sdiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[3] ; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 @@ -2428,7 +2417,8 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s ; CHECK-GI-NEXT: sdiv w14, w12, w14 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: mov v17.s[0], w13 +; CHECK-GI-NEXT: mov w13, v7.s[3] ; CHECK-GI-NEXT: sdiv w15, w12, w15 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w14 @@ -2441,13 +2431,14 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: sdiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v6.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h ; CHECK-GI-NEXT: sdiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v6.s[3] ; CHECK-GI-NEXT: sshll2 v6.4s, v1.8h, #0 ; CHECK-GI-NEXT: mov v18.s[1], w17 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: sdiv w0, w0, w1 ; CHECK-GI-NEXT: fmov w1, s6 ; CHECK-GI-NEXT: mov v18.s[2], w18 @@ -2457,11 +2448,10 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s ; CHECK-GI-NEXT: sdiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: sdiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: sdiv w10, w11, w13 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ -2502,7 +2492,7 @@ define <2 x i16> @uv2i16(<2 x i16> %d, <2 x i16> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -2556,11 +2546,9 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: udiv w16, w14, w15 ; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -2609,7 +2597,7 @@ define <4 x i16> @uv4i16(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -2684,12 +2672,12 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: fmov w13, s1 ; CHECK-GI-NEXT: mov w14, v1.s[1] ; CHECK-GI-NEXT: mov w15, v1.s[2] -; CHECK-GI-NEXT: mov w16, v1.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v2.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; 
CHECK-GI-NEXT: mov w10, v2.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v0.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v2.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -2702,11 +2690,11 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v0.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v1.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v0.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s @@ -2867,18 +2855,17 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mov w1, v7.s[3] ; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w11, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[1] ; CHECK-GI-NEXT: mov w9, v5.s[1] ; CHECK-GI-NEXT: fmov w2, s7 ; CHECK-GI-NEXT: mov w3, v7.s[1] ; CHECK-GI-NEXT: mov w4, v7.s[2] -; CHECK-GI-NEXT: mov w5, v7.s[3] -; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[2] ; CHECK-GI-NEXT: mov w9, v5.s[2] ; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: mov v16.s[0], w11 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: fmov w13, s5 ; CHECK-GI-NEXT: mov w14, v5.s[1] @@ -2887,7 +2874,7 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: udiv w9, w8, w9 ; CHECK-GI-NEXT: mov w8, v4.s[3] ; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: mov v16.s[1], w10 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: fmov w12, s4 @@ -2898,7 +2885,8 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s ; CHECK-GI-NEXT: udiv w14, w12, w14 ; CHECK-GI-NEXT: mov w12, v4.s[2] -; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: mov v17.s[0], w13 +; CHECK-GI-NEXT: mov w13, v7.s[3] ; CHECK-GI-NEXT: udiv w15, w12, w15 ; CHECK-GI-NEXT: mov w12, v4.s[3] ; CHECK-GI-NEXT: mov v17.s[1], w14 @@ -2911,13 +2899,14 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s ; CHECK-GI-NEXT: udiv w17, w17, w18 ; CHECK-GI-NEXT: mov w18, v6.s[2] -; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: mov v18.s[0], w16 ; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h ; CHECK-GI-NEXT: udiv w18, w18, w0 ; CHECK-GI-NEXT: mov w0, v6.s[3] ; CHECK-GI-NEXT: ushll2 v6.4s, v1.8h, #0 ; CHECK-GI-NEXT: mov v18.s[1], w17 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov w11, v6.s[3] ; CHECK-GI-NEXT: udiv w0, w0, w1 ; CHECK-GI-NEXT: fmov w1, s6 ; CHECK-GI-NEXT: mov v18.s[2], w18 @@ -2927,11 +2916,10 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s ; CHECK-GI-NEXT: udiv w2, w2, w3 ; CHECK-GI-NEXT: mov w3, v6.s[2] -; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: mov v19.s[0], w1 ; CHECK-GI-NEXT: udiv w3, w3, w4 -; CHECK-GI-NEXT: mov w4, v6.s[3] ; CHECK-GI-NEXT: mov v19.s[1], w2 -; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: udiv w10, w11, w13 ; CHECK-GI-NEXT: mov v19.s[2], w3 ; CHECK-GI-NEXT: mov v19.s[3], w10 ; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s @@ 
-2970,7 +2958,7 @@ define <2 x i32> @sv2i32(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3002,10 +2990,10 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { ; ; CHECK-GI-LABEL: sv3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: mov s0, v0.s[2] ; CHECK-GI-NEXT: mov s1, v1.s[2] ; CHECK-GI-NEXT: sdiv w10, w8, w9 @@ -3015,11 +3003,11 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: sdiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sdiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sdiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -3063,7 +3051,7 @@ define <4 x i32> @sv4i32(<4 x i32> %d, <4 x i32> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -3141,12 +3129,12 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: fmov w13, s3 ; CHECK-GI-NEXT: mov w14, v3.s[1] ; CHECK-GI-NEXT: mov w15, v3.s[2] -; CHECK-GI-NEXT: mov w16, v3.s[3] ; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: sdiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v1.s[3] ; CHECK-GI-NEXT: sdiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -3159,11 +3147,11 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s ; CHECK-GI-NEXT: sdiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v1.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v3.s[3] ; CHECK-GI-NEXT: sdiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v1.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: sdiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s @@ -3201,7 +3189,7 @@ define <2 x i32> @uv2i32(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3233,10 +3221,10 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { ; ; CHECK-GI-LABEL: uv3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; 
CHECK-GI-NEXT: mov s3, v1.s[1] ; CHECK-GI-NEXT: mov s0, v0.s[2] ; CHECK-GI-NEXT: mov s1, v1.s[2] ; CHECK-GI-NEXT: udiv w10, w8, w9 @@ -3246,11 +3234,11 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: udiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: udiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: udiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -3294,7 +3282,7 @@ define <4 x i32> @uv4i32(<4 x i32> %d, <4 x i32> %e) { ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[0], w8 ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v2.s[1], w9 @@ -3372,12 +3360,12 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: fmov w13, s3 ; CHECK-GI-NEXT: mov w14, v3.s[1] ; CHECK-GI-NEXT: mov w15, v3.s[2] -; CHECK-GI-NEXT: mov w16, v3.s[3] ; CHECK-GI-NEXT: udiv w8, w8, w9 ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: udiv w9, w9, w10 ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[0], w8 +; CHECK-GI-NEXT: mov w8, v1.s[3] ; CHECK-GI-NEXT: udiv w10, w10, w11 ; CHECK-GI-NEXT: mov w11, v0.s[3] ; CHECK-GI-NEXT: mov v4.s[1], w9 @@ -3390,11 +3378,11 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) { ; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s ; CHECK-GI-NEXT: udiv w13, w13, w14 ; CHECK-GI-NEXT: mov w14, v1.s[2] -; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: mov v5.s[0], w12 +; CHECK-GI-NEXT: mov w12, v3.s[3] ; CHECK-GI-NEXT: udiv w14, w14, w15 -; CHECK-GI-NEXT: mov w15, v1.s[3] ; CHECK-GI-NEXT: mov v5.s[1], w13 -; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: udiv w8, w8, w12 ; CHECK-GI-NEXT: mov v5.s[2], w14 ; CHECK-GI-NEXT: mov v5.s[3], w8 ; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s @@ -3427,14 +3415,14 @@ define <2 x i64> @sv2i64(<2 x i64> %d, <2 x i64> %e) { ; CHECK-GI-NEXT: mov x11, v0.d[1] ; CHECK-GI-NEXT: sdiv x8, x8, x9 ; CHECK-GI-NEXT: sdiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x9, x11, x10 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret entry: @@ -3481,21 +3469,21 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: sdiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x14, d3 -; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: sdiv x9, x9, x10 -; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: mov v6.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: sdiv x10, x8, x9 -; CHECK-GI-NEXT: fmov x13, d6 -; CHECK-GI-NEXT: mov x11, 
v6.d[1] -; CHECK-GI-NEXT: mul x13, x13, x14 -; CHECK-GI-NEXT: mul x11, x11, x12 -; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: sdiv x12, x8, x9 +; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: mov x13, v6.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: msub x8, x12, x9, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3542,26 +3530,26 @@ define <4 x i64> @sv4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: mov x15, v1.d[1] ; CHECK-GI-NEXT: sdiv x8, x8, x9 -; CHECK-GI-NEXT: sdiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: sdiv x12, x12, x13 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d2 -; CHECK-GI-NEXT: mov x8, v2.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d2, x9 -; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: sdiv x11, x11, x10 +; CHECK-GI-NEXT: mov v3.d[0], x12 ; CHECK-GI-NEXT: sdiv x15, x15, x14 -; CHECK-GI-NEXT: fmov d3, x12 -; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x10, x11, x10 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: mov v3.d[1], x15 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x10, v3.d[1] -; CHECK-GI-NEXT: mul x11, x11, x13 -; CHECK-GI-NEXT: mul x10, x10, x14 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mul x9, x9, x13 +; CHECK-GI-NEXT: mul x11, x12, x14 +; CHECK-GI-NEXT: mov v3.d[0], x9 +; CHECK-GI-NEXT: mov v3.d[1], x11 ; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-GI-NEXT: ret entry: @@ -3592,14 +3580,14 @@ define <2 x i64> @uv2i64(<2 x i64> %d, <2 x i64> %e) { ; CHECK-GI-NEXT: mov x11, v0.d[1] ; CHECK-GI-NEXT: udiv x8, x8, x9 ; CHECK-GI-NEXT: udiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[0], x8 ; CHECK-GI-NEXT: mov v1.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d1 -; CHECK-GI-NEXT: mov x8, v1.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov x11, v1.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x9, x11, x10 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret entry: @@ -3646,21 +3634,21 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: udiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x14, d3 -; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: udiv x9, x9, x10 -; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: mov v6.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: udiv x10, x8, x9 -; CHECK-GI-NEXT: fmov x13, d6 -; CHECK-GI-NEXT: mov x11, v6.d[1] -; CHECK-GI-NEXT: mul x13, x13, x14 -; CHECK-GI-NEXT: mul x11, x11, x12 -; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: 
udiv x12, x8, x9 +; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: mov x13, v6.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x10 ; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: msub x8, x12, x9, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -3707,26 +3695,26 @@ define <4 x i64> @uv4i64(<4 x i64> %d, <4 x i64> %e) { ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: mov x15, v1.d[1] ; CHECK-GI-NEXT: udiv x8, x8, x9 -; CHECK-GI-NEXT: udiv x11, x11, x10 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: udiv x12, x12, x13 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: fmov x11, d2 -; CHECK-GI-NEXT: mov x8, v2.d[1] -; CHECK-GI-NEXT: mul x9, x11, x9 -; CHECK-GI-NEXT: mul x8, x8, x10 -; CHECK-GI-NEXT: fmov d2, x9 -; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: udiv x11, x11, x10 +; CHECK-GI-NEXT: mov v3.d[0], x12 ; CHECK-GI-NEXT: udiv x15, x15, x14 -; CHECK-GI-NEXT: fmov d3, x12 -; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mul x10, x11, x10 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: mov v3.d[1], x15 -; CHECK-GI-NEXT: fmov x11, d3 -; CHECK-GI-NEXT: mov x10, v3.d[1] -; CHECK-GI-NEXT: mul x11, x11, x13 -; CHECK-GI-NEXT: mul x10, x10, x14 -; CHECK-GI-NEXT: fmov d3, x11 -; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mul x9, x9, x13 +; CHECK-GI-NEXT: mul x11, x12, x14 +; CHECK-GI-NEXT: mov v3.d[0], x9 +; CHECK-GI-NEXT: mov v3.d[1], x11 ; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index fa0447c2c5d79..adac75758220e 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -165,18 +165,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: sqadd v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: sqadd v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -249,12 +251,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; 
CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 529a3b72e0971..0f256c1f18f58 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -224,15 +224,13 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI-NEXT: lsl w10, w2, #8 ; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: sxth w10, w10 ; CHECK-GI-NEXT: asr w8, w8, #8 ; CHECK-GI-NEXT: asr w9, w9, #8 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: asr w8, w10, #8 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: sxth w8, w10 +; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -254,10 +252,10 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: sxtb w9, w1 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sxtb w8, w1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: sxtb w8, w2 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -311,7 +309,7 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: smov w8, v0.h[0] ; CHECK-GI-NEXT: smov w9, v0.h[1] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: smov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 @@ -391,15 +389,13 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI-NEXT: lsl w10, w2, #6 ; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: sxth w10, w10 ; CHECK-GI-NEXT: asr w8, w8, #6 ; CHECK-GI-NEXT: asr w9, w9, #6 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: asr w8, w10, #6 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: sxth w8, w10 +; CHECK-GI-NEXT: asr w8, w8, #6 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -421,10 +417,10 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sbfx w8, w0, #0, #10 -; CHECK-GI-NEXT: sbfx w9, w1, #0, #10 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sbfx w8, w1, #0, #10 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: sbfx w8, w2, #0, #10 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -1033,43 +1029,29 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) { ; CHECK-GI-LABEL: sext_v16i10_v16i16: ; CHECK-GI: // 
%bb.0: // %entry ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w4 -; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[4], w4 +; CHECK-GI-NEXT: mov v1.h[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[4], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[5], w5 +; CHECK-GI-NEXT: mov v1.h[5], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[6], w6 +; CHECK-GI-NEXT: mov v1.h[6], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[6], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov v1.h[6], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v0.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[7], v3.h[0] +; CHECK-GI-NEXT: mov v0.h[7], w7 +; CHECK-GI-NEXT: mov v1.h[7], w8 ; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6 ; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6 ; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6 @@ -1123,54 +1105,42 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w9, [sp, #32] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: ldr w10, [sp, #8] ; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov v0.h[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w10 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w6 -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov 
v1.h[2], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] -; CHECK-GI-NEXT: fmov s2, w7 -; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v5.4h, #0 -; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22 -; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22 -; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22 ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22 +; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22 -; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #22 +; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i10> %a to <16 x i32> @@ -1228,67 +1198,55 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w8 -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v0.2d, 
v0.4s, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v5.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 -; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 +; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v7.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 +; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 ; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54 ; CHECK-GI-NEXT: shl v17.2d, v1.2d, #54 -; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 -; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 -; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 +; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 ; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54 ; CHECK-GI-NEXT: shl v19.2d, v3.2d, #54 +; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 +; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 ; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54 ; CHECK-GI-NEXT: sshr v3.2d, v17.2d, #54 +; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 ; CHECK-GI-NEXT: sshr v5.2d, v18.2d, #54 ; CHECK-GI-NEXT: sshr v6.2d, v7.2d, #54 ; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54 diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 9c8d3e0f07de8..951458da17c07 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -537,22 +537,29 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: mov h3, v1.h[1] ; CHECK-GI-NEXT: mov h4, v0.h[2] ; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: mov h6, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov h3, v1.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = shl <4 x i8> %0, %1 @@ -587,10 +594,10 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -628,7 +635,7 @@ define <1 x 
i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsl w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = shl <1 x i32> %0, %1 @@ -684,24 +691,31 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h3, v0.h[1] ; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov h2, v1.h[3] +; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: mov h4, v0.h[3] +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: mov v0.b[3], w8 ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = ashr <4 x i8> %0, %1 @@ -734,11 +748,11 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI-LABEL: ashr_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: mov w8, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -774,7 +788,7 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: asr w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = ashr <1 x i32> %0, %1 @@ -821,24 +835,31 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h3, v0.h[1] ; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: mov h5, v1.h[3] -; CHECK-GI-NEXT: mov h6, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w8, s2 +; 
CHECK-GI-NEXT: mov h2, v1.h[3] +; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: mov h4, v0.h[3] +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov h3, v0.h[2] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: mov v0.b[3], w8 ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v3.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = lshr <4 x i8> %0, %1 @@ -870,11 +891,11 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; CHECK-GI-LABEL: lshr_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: mov w8, v1.s[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s3, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -910,7 +931,7 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsr w8, w8, w9 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %3 = lshr <1 x i32> %0, %1 @@ -962,16 +983,12 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: shl_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: fmov s3, w4 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: fmov s1, w3 +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v1.b[1], w4 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v1.b[2], w5 +; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] ; CHECK-GI-NEXT: umov w1, v0.b[1] ; CHECK-GI-NEXT: umov w2, v0.b[2] @@ -1038,15 +1055,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: ashr_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w3 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w4 +; CHECK-GI-NEXT: mov v1.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w5 +; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg 
v0.8b, v0.8b ; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] @@ -1118,15 +1131,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-LABEL: lshr_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w3 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w2 -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w4 +; CHECK-GI-NEXT: mov v1.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w5 +; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg v0.8b, v0.8b ; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: umov w0, v0.b[0] diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index b1131f287fe9a..954458e445974 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -213,17 +213,23 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: adrp x8, .LCPI15_0 ; CHECK-GI-NEXT: mov h4, v0.h[2] ; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: mov h6, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov h3, v1.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -280,11 +286,11 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov s2, v0.s[1] -; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 -; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -397,8 +403,17 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: dup v0.8b, w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov h1, v0.h[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.b[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = 
shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> @@ -433,8 +448,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: dup v0.4h, w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: dup v0.4h, v0.h[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> @@ -493,18 +510,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) { ; CHECK-GI-LABEL: shufflevector_v3i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: fmov s1, w3 ; CHECK-GI-NEXT: adrp x8, .LCPI30_0 -; CHECK-GI-NEXT: fmov s2, w3 -; CHECK-GI-NEXT: fmov s3, w4 -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v1.b[1], w4 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v1.b[2], w5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v0.d[1], v2.d[0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov b2, v0.b[2] @@ -614,7 +627,10 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) { ; ; CHECK-GI-LABEL: shufflevector_v3i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov b2, v0.b[2] ; CHECK-GI-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index d8b2762cf15e9..12371ef2c0021 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -166,18 +166,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: sqsub v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: sqsub v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -250,12 +252,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr 
h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 907605494dfbd..8e7586bd4843c 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: ldr b1, [x0, #1] -; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-GI-NEXT: ldr b2, [x0, #1] ; CHECK-GI-NEXT: ldr b3, [x1, #1] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: sub v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str b0, [x0] ; CHECK-GI-NEXT: str b1, [x0, #1] @@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #1] -; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w9, [x1] +; CHECK-GI-NEXT: ldrb w10, [x0, #1] ; CHECK-GI-NEXT: ldrb w11, [x1, #1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldrb w8, [x0, #2] ; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov h2, v0.h[2] ; CHECK-GI-NEXT: str b0, [x0] @@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; 
CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] +; CHECK-GI-NEXT: xtn v0.8b, v1.8h ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: add x10, x1, #4 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ldr h1, [x0, #4] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: ldr h3, [x1, #4] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] -; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index afc0d8704ebac..e99935e8677fc 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -162,18 +162,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: uqadd v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; 
CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: uqadd v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -248,12 +250,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index dfcbe96ea948a..cdba9625431a5 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -163,18 +163,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov b3, v1.b[1] -; CHECK-GI-NEXT: mov b4, v0.b[2] -; CHECK-GI-NEXT: mov b5, v0.b[3] -; CHECK-GI-NEXT: mov b6, v1.b[3] -; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v1.b[2] -; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] -; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] -; CHECK-GI-NEXT: uqsub v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[1] +; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] +; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] +; CHECK-GI-NEXT: mov b4, v1.b[2] +; CHECK-GI-NEXT: mov b1, v1.b[3] +; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] +; CHECK-GI-NEXT: uqsub v0.8b, v3.8b, v5.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x2] ; CHECK-GI-NEXT: ret @@ -245,12 +247,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: ldr h2, [x1] -; CHECK-GI-NEXT: ldr h3, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x1, #2 +; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: str h0, [x2] ; CHECK-GI-NEXT: str h1, [x2, #2] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 54ada05c90448..f46e6ae989ff2 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3813,71 +3813,49 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; CHECK-GI-LABEL: add_v24i8_v24i16_zext: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; 
CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w3 -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w5 -; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w6 -; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w7 -; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v0.b[3], w3 +; CHECK-GI-NEXT: mov v0.b[4], w4 +; CHECK-GI-NEXT: mov v0.b[5], w5 +; CHECK-GI-NEXT: mov v0.b[6], w6 +; CHECK-GI-NEXT: mov v0.b[7], w7 +; CHECK-GI-NEXT: mov v0.b[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.b[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #88] -; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[1], w10 +; CHECK-GI-NEXT: mov v0.b[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #96] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #104] -; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #112] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[5], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-NEXT: uaddlv h1, v1.8b +; CHECK-GI-NEXT: mov v1.b[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.b[15], w8 +; CHECK-GI-NEXT: mov v1.b[7], w9 ; CHECK-GI-NEXT: uaddlv h0, v0.16b -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: uaddlv h1, v1.8b ; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -3960,71 +3938,49 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { ; CHECK-GI-LABEL: add_v24i8_v24i16_sext: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 
-; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #72] -; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w3 -; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w5 -; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w6 -; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w7 -; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: mov v0.b[3], w3 +; CHECK-GI-NEXT: mov v0.b[4], w4 +; CHECK-GI-NEXT: mov v0.b[5], w5 +; CHECK-GI-NEXT: mov v0.b[6], w6 +; CHECK-GI-NEXT: mov v0.b[7], w7 +; CHECK-GI-NEXT: mov v0.b[8], w8 +; CHECK-GI-NEXT: ldr w8, [sp, #64] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: ldr w8, [sp, #8] -; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.b[9], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #80] -; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #88] -; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[1], w10 +; CHECK-GI-NEXT: mov v0.b[10], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #96] -; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[2], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[11], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #32] -; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #104] -; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[12], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #40] -; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #112] -; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[4], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[13], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #48] -; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[5], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[14], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #56] -; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-NEXT: fmov s3, w8 -; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-NEXT: saddlv h1, v1.8b +; CHECK-GI-NEXT: mov v1.b[6], w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.b[15], w8 +; CHECK-GI-NEXT: mov v1.b[7], w9 ; CHECK-GI-NEXT: saddlv h0, v0.16b -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: saddlv h1, v1.8b ; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -4168,71 +4124,49 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-BASE: // %bb.0: // %entry ; 
CHECK-GI-BASE-NEXT: fmov s0, w0 -; CHECK-GI-BASE-NEXT: fmov s1, w1 ; CHECK-GI-BASE-NEXT: ldr w8, [sp] -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] ; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] -; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w2 -; CHECK-GI-BASE-NEXT: fmov s2, w10 -; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w3 -; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w4 -; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w5 -; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w6 -; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w7 -; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 +; CHECK-GI-BASE-NEXT: mov v0.b[2], w2 +; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 +; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 +; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 +; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 +; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 +; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] ; CHECK-GI-BASE-NEXT: fmov s1, w8 -; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] -; CHECK-GI-BASE-NEXT: fmov s3, w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] -; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 ; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] -; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] -; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 +; CHECK-GI-BASE-NEXT: mov v0.b[10], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] -; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] -; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; CHECK-GI-BASE-NEXT: mov v0.b[11], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] -; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] -; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] -; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] -; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 -; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b +; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 +; CHECK-GI-BASE-NEXT: 
ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 ; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b -; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b ; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: fmov w9, s1 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff ; CHECK-GI-BASE-NEXT: ret @@ -4240,76 +4174,54 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-DOT: // %bb.0: // %entry ; CHECK-GI-DOT-NEXT: fmov s0, w0 -; CHECK-GI-DOT-NEXT: fmov s1, w1 -; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] -; CHECK-GI-DOT-NEXT: movi v4.8b, #1 -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] -; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w2 -; CHECK-GI-DOT-NEXT: fmov s3, w10 -; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w3 -; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w4 -; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w5 -; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w6 -; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w7 -; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: movi v2.8b, #1 +; CHECK-GI-DOT-NEXT: movi v3.8b, #1 ; CHECK-GI-DOT-NEXT: fmov s1, w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 +; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 +; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] -; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 +; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] -; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 +; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 +; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] -; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 +; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 +; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 +; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] 
+; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] -; CHECK-GI-DOT-NEXT: fmov s5, w9 -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] -; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 -; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: fmov d1, d1 -; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b -; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s +; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 +; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret @@ -4484,71 +4396,49 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: fmov s0, w0 -; CHECK-GI-BASE-NEXT: fmov s1, w1 ; CHECK-GI-BASE-NEXT: ldr w8, [sp] -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] ; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] -; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w2 -; CHECK-GI-BASE-NEXT: fmov s2, w10 -; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w3 -; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w4 -; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w5 -; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w6 -; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w7 -; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 +; CHECK-GI-BASE-NEXT: mov v0.b[2], w2 +; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 +; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 +; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 +; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 +; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 +; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] ; CHECK-GI-BASE-NEXT: fmov s1, w8 -; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] -; CHECK-GI-BASE-NEXT: fmov s3, w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] -; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] -; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 ; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] -; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] -; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 +; CHECK-GI-BASE-NEXT: mov v0.b[10], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] -; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] -; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; 
CHECK-GI-BASE-NEXT: mov v0.b[11], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] -; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] -; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] -; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 ; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] -; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] -; CHECK-GI-BASE-NEXT: fmov s2, w9 -; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-BASE-NEXT: fmov s3, w8 -; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] -; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b +; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 ; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b -; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b ; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: fmov w9, s1 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: sxth w0, w8 ; CHECK-GI-BASE-NEXT: ret @@ -4556,76 +4446,54 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-DOT: // %bb.0: // %entry ; CHECK-GI-DOT-NEXT: fmov s0, w0 -; CHECK-GI-DOT-NEXT: fmov s1, w1 -; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w8, [sp] ; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] -; CHECK-GI-DOT-NEXT: movi v4.8b, #1 -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] -; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w2 -; CHECK-GI-DOT-NEXT: fmov s3, w10 -; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w3 -; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w4 -; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w5 -; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w6 -; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] -; CHECK-GI-DOT-NEXT: fmov s1, w7 -; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: movi v2.8b, #1 +; CHECK-GI-DOT-NEXT: movi v3.8b, #1 ; CHECK-GI-DOT-NEXT: fmov s1, w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 +; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 +; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] -; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; 
CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 +; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] -; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 +; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 +; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] -; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 -; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 +; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 ; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] -; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 +; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 +; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] +; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] -; CHECK-GI-DOT-NEXT: fmov s5, w9 -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 ; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] -; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] -; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] -; CHECK-GI-DOT-NEXT: fmov s3, w8 -; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: fmov d1, d1 -; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] -; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b -; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s +; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 +; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index e536ba240453e..ead790203f949 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -127,12 +127,19 @@ entry: } define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.s[1], w2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.s[1], w2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: xtn_v2i128_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i8> ret <2 x i8> %arg1 @@ -168,8 +175,7 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) { ; CHECK-GI-LABEL: xtn_v2i128_v2i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w2 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w2 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -189,23 +195,36 @@ entry: } define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.s[1], w2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.s[1], w2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xtn_v2i128_v2i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i32> ret <2 x i32> %arg1 } define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) { -; CHECK-LABEL: xtn_v2i128_v2i64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: mov v0.d[1], x2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v2i128_v2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: mov v0.d[1], x2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xtn_v2i128_v2i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: mov v0.d[1], x2 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i64> ret <2 x i64> %arg1 @@ -282,10 +301,10 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) { ; ; CHECK-GI-LABEL: xtn_v3i32_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -309,11 +328,9 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) { ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -334,10 +351,10 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) { ; CHECK-GI-LABEL: xtn_v3i64_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: fmov x8, d2 -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 
bb968c8eb00fc..7e95b6684e821 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -245,11 +245,9 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI-NEXT: and w8, w0, #0xff ; CHECK-GI-NEXT: and w9, w1, #0xff ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -271,10 +269,10 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: zext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: and w9, w1, #0xff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0xff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -328,7 +326,7 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] -; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: umov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 @@ -406,11 +404,9 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI-NEXT: and w8, w0, #0x3ff ; CHECK-GI-NEXT: and w9, w1, #0x3ff ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -432,10 +428,10 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: zext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0x3ff -; CHECK-GI-NEXT: and w9, w1, #0x3ff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0x3ff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: @@ -1089,51 +1085,39 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s3, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w9, [sp, #32] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: ldr w10, [sp, #8] ; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: fmov s5, w10 -; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov v0.h[1], w1 ; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w2 -; CHECK-GI-NEXT: mov v2.h[1], v4.h[0] -; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: movi v4.4s, #3, msl #8 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: 
mov v2.h[1], w10 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w6 -; CHECK-GI-NEXT: mov v2.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w8 -; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w3 -; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] -; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w7 -; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] -; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] -; CHECK-GI-NEXT: movi v3.4s, #3, msl #8 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v5.4h, #0 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b -; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i10> %a to <16 x i32> @@ -1185,62 +1169,50 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w1 ; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: fmov s2, w5 -; CHECK-GI-NEXT: ldr w9, [sp, #8] ; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w11, [sp, #40] ; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[1], w5 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[1], w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mov v1.h[2], w6 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w2 -; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] -; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v0.h[3], w3 +; CHECK-GI-NEXT: mov v1.h[3], w7 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 ; CHECK-GI-NEXT: adrp x8, .LCPI54_0 ; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0] -; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w6 -; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] -; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 -; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w3 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; 
CHECK-GI-NEXT: fmov s4, w7 -; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] -; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll v6.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll2 v16.2d, v1.4s, #0 -; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b -; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b -; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b -; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b +; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v19.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v20.2d, v3.4s, #0 +; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b +; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b ; CHECK-GI-NEXT: and v2.16b, v6.16b, v7.16b ; CHECK-GI-NEXT: and v3.16b, v16.16b, v7.16b +; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b +; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b ; CHECK-GI-NEXT: and v6.16b, v19.16b, v7.16b ; CHECK-GI-NEXT: and v7.16b, v20.16b, v7.16b ; CHECK-GI-NEXT: ret From 12c0823d67a8d5a61d6430aac609ef5e468267a6 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:06:40 -0700 Subject: [PATCH 014/425] [clang-format] Handle spaces in file paths in git-clang-format.bat (#107041) This patch is provided by @jeliebig. Fixes #107017. --- clang/tools/clang-format/git-clang-format.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/tools/clang-format/git-clang-format.bat b/clang/tools/clang-format/git-clang-format.bat index 9965cd4312fe3..19c82d8a04132 100644 --- a/clang/tools/clang-format/git-clang-format.bat +++ b/clang/tools/clang-format/git-clang-format.bat @@ -1 +1 @@ -py -3 %~pn0 %* +py -3 "%~pn0" %* From a27ff17034d66d852ba83be7d237d6a623cb4ff4 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:07:14 -0700 Subject: [PATCH 015/425] [clang-format] Fix a regression in annotating ObjCBlockLParen (#107021) Fixes #106994. --- clang/lib/Format/TokenAnnotator.cpp | 3 +-- clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 64e936f627d43..bf37062393719 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3588,8 +3588,7 @@ static FormatToken *getFunctionName(const AnnotatedLine &Line, // Make sure the name is followed by a pair of parentheses. 
if (Name) { - if (Tok->is(tok::l_paren) && Tok->isNot(TT_FunctionTypeLParen) && - Tok->MatchingParen) { + if (Tok->is(tok::l_paren) && Tok->is(TT_Unknown) && Tok->MatchingParen) { OpeningParen = Tok; return Name; } diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 5d37a65250d0b..b8a86245808e5 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1678,6 +1678,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsObjCBlock) { "}();"); ASSERT_EQ(Tokens.size(), 19u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_ObjCBlockLBrace); + + Tokens = annotate("id (^block)(Foo *a) = ^id _Nullable(Foo *_Nullable a) {\n" + " return a;\n" + "};"); + ASSERT_EQ(Tokens.size(), 27u) << Tokens; + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); // Not CtorDtorDeclName. + EXPECT_TOKEN(Tokens[1], tok::l_paren, TT_ObjCBlockLParen); } TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodExpr) { From b55186eefd73b3848e01c8471c47a9354969d652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Wed, 4 Sep 2024 04:07:35 +0200 Subject: [PATCH 016/425] [clang][Driver] Define soft float macros for PPC. (#106012) Fixes #105972. Co-authored-by: Qiu Chaofan --- clang/lib/Basic/Targets/PPC.cpp | 19 ++++++++++++++----- clang/lib/Basic/Targets/PPC.h | 2 ++ clang/test/Preprocessor/init-ppc.c | 18 ++++++++++++++++++ clang/test/Preprocessor/init-ppc64.c | 18 ++++++++++++++++++ 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 04dc436eb1b9c..1448069173b5f 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -68,6 +68,10 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, HasSPE = true; LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); + } else if (Feature == "+frsqrte") { + HasFrsqrte = true; + } else if (Feature == "+frsqrtes") { + HasFrsqrtes = true; } else if (Feature == "-hard-float") { FloatABI = SoftFloat; } else if (Feature == "+paired-vector-memops") { @@ -402,9 +406,18 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); } - if (HasSPE) { + if (HasSPE) Builder.defineMacro("__SPE__"); + if (HasSPE || FloatABI == SoftFloat) Builder.defineMacro("__NO_FPRS__"); + if (FloatABI == SoftFloat) { + Builder.defineMacro("_SOFT_FLOAT"); + Builder.defineMacro("_SOFT_DOUBLE"); + } else { + if (HasFrsqrte) + Builder.defineMacro("__RSQRTE__"); + if (HasFrsqrtes) + Builder.defineMacro("__RSQRTEF__"); } if (HasVSX) Builder.defineMacro("__VSX__"); @@ -439,14 +452,10 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // FIXME: The following are not yet generated here by Clang, but are // generated by GCC: // - // _SOFT_FLOAT_ // __RECIP_PRECISION__ // __APPLE_ALTIVEC__ // __RECIP__ // __RECIPF__ - // __RSQRTE__ - // __RSQRTEF__ - // _SOFT_DOUBLE_ // __NO_LWSYNC__ // __CMODEL_MEDIUM__ // __CMODEL_LARGE__ diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 6d5d8dd54d013..b0833d30550af 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -73,6 +73,8 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasExtDiv = false; bool HasP9Vector = false; bool HasSPE = false; + bool HasFrsqrte = false; + bool HasFrsqrtes = false; bool PairedVectorMemops = false; 
bool HasP10Vector = false; bool HasPCRelativeMemops = false; diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c index 3fb642af9d742..1421b102a3dfd 100644 --- a/clang/test/Preprocessor/init-ppc.c +++ b/clang/test/Preprocessor/init-ppc.c @@ -977,3 +977,21 @@ // RUN: %clang_cc1 -E -dM -triple=powerpc-unknown-openbsd -x c++ < /dev/null | FileCheck -match-full-lines -check-prefix PPC-OPENBSD-CXX %s // PPC-OPENBSD-CXX: #define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR4-RSQRT %s +// +// PPCPWR4-RSQRT-NOT:#define __RSQRTEF__ 1 +// PPCPWR4-RSQRT-NOT:#define __RSQRTE__ 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-feature +frsqrte -target-feature +frsqrtes < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5-RSQRT %s +// +// PPCPWR5-RSQRT:#define __RSQRTEF__ 1 +// PPCPWR5-RSQRT:#define __RSQRTE__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-feature -hard-float < /dev/null | FileCheck -match-full-lines -check-prefix PPC-SOFTFLT %s +// +// PPC-SOFTFLT:#define _SOFT_DOUBLE 1 +// PPC-SOFTFLT:#define _SOFT_FLOAT 1 +// PPC-SOFTFLT:#define __NO_FPRS__ 1 +// PPC-SOFTFLT-NOT:#define __RSQRTE__ 1 +// PPC-SOFTFLT-NOT:#define __RSQRTEF__ 1 diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index 56164beb913d5..57e2ca31d5d53 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -1110,3 +1110,21 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s // PPC64-FREEBSD-NOT: #define __LONG_DOUBLE_128__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix PPC64PWR4-RSQRT %s +// +// PPC64PWR4-RSQRT-NOT:#define __RSQRTEF__ 1 +// PPC64PWR4-RSQRT-NOT:#define __RSQRTE__ 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +frsqrte -target-feature +frsqrtes < /dev/null | FileCheck -match-full-lines -check-prefix PPC64PWR5-RSQRT %s +// +// PPC64PWR5-RSQRT:#define __RSQRTEF__ 1 +// PPC64PWR5-RSQRT:#define __RSQRTE__ 1 + +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-feature -hard-float -xc /dev/null | FileCheck --check-prefix=PPC64-SOFTFLT %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-feature -hard-float -xc /dev/null | FileCheck --check-prefix=PPC64-SOFTFLT %s +// PPC64-SOFTFLT:#define _SOFT_DOUBLE 1 +// PPC64-SOFTFLT:#define _SOFT_FLOAT 1 +// PPC64-SOFTFLT:#define __NO_FPRS__ 1 +// PPC64-SOFTFLT-NOT:#define __RSQRTE__ 1 +// PPC64-SOFTFLT-NOT:#define __RSQRTEF__ 1 From 8d0816615f920b0783bafa903804b9e2a2fa4e91 Mon Sep 17 00:00:00 2001 From: yifeizh2 Date: Wed, 4 Sep 2024 10:10:43 +0800 Subject: [PATCH 017/425] [MLIR][Tensor] Fix source/dest type check in UnPackOp canonicalize (#106094) Fix `RankedTensorType` equality check in unpack op canonicalization. 
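
As an illustration only (not part of the change itself), here is a minimal
MLIR sketch of the case the corrected check guards against, modeled on the
regression test added below; the function and value names are placeholders:

  // Folding this unpack(pack(x)) pair back to %t would be wrong: %t has
  // type tensor<256x512xbf16> while the unpack result has type
  // tensor<224x512xbf16>, so the fold must bail out whenever the pack
  // source type differs from the unpack destination type.
  func.func @unpack_of_pack_no_fold(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> {
    %e0 = tensor.empty() : tensor<4x16x64x32xbf16>
    %e1 = tensor.empty() : tensor<224x512xbf16>
    %p = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %e0 : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16>
    %u = tensor.unpack %p inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %e1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16>
    return %u : tensor<224x512xbf16>
  }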
--- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 6 +++--- mlir/test/Dialect/Tensor/canonicalize.mlir | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index e11c6aaccf74d..996de530c255d 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -4203,7 +4203,7 @@ static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, } LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { - // Fold an unpack(pack(x)) to x. + // Fold an pack(unpack(x)) to x. if (auto unPackOp = packOp.getSource().getDefiningOp()) { if (unPackOp.getSourceType() != packOp.getDestType()) return failure(); @@ -4437,9 +4437,9 @@ static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, PatternRewriter &rewriter) { - /// pack(unpack(x)) -> x + /// unpack(pack(x)) -> x if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { - if (packOp.getDestType() != unPackOp.getSourceType()) + if (packOp.getSourceType() != unPackOp.getDestType()) return failure(); if (packOp.getPaddingValue() || !hasSameInnerOuterAttribute(packOp, unPackOp) || diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 458ff51be7462..735790e5bd6c5 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2268,6 +2268,19 @@ func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> // ----- +// CHECK: func.func @unpack_pack_with_padding_no_canonicalization( +// CHECK: tensor.pack +// CHECK: tensor.unpack +func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { + %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> + %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> + %packed = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> + %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> + return %unpacked : tensor<224x512xbf16> +} + +// ----- + // Chain NCnc -> NC -> NC -> NCnc // CHECK: func.func @pack_unpack( // CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, From 812c96e8b9354e5e84d513f5b03172db5ad3b491 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 19:11:12 -0700 Subject: [PATCH 018/425] [clang-format] Handle pointer/reference in macro definitions (#107074) A macro definition needs its own scope stack in the annotator, so we add the MacroBodyScopes stack and use ScopeStack to refer to it when in the macro definition body. Also, we need to have a scope type for a child block because its parent line is parsed (and thus the scope type for the braces is popped off the scope stack) before the lines in the child block are. Fixes #99271. 
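
As an illustration only (not part of the change), here is a small C++ sketch
of the inputs this affects, based on the annotator unit tests added below;
the identifiers are placeholders:

  // In the macro body, `a * b` is an expression, so the `*` should be
  // annotated as a binary operator rather than as a pointer declarator.
  #define FOO \
    void foo() { f(a * b); }

  // In a declaration inside a macro body, `*` must still be annotated as a
  // pointer/reference, even when the definition sits in another scope.
  namespace {
  #define BAR(x) void bar(a##x *b);
  }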
--- clang/lib/Format/TokenAnnotator.cpp | 24 +++++++++++++------ clang/lib/Format/TokenAnnotator.h | 8 +++---- clang/unittests/Format/TokenAnnotatorTest.cpp | 15 ++++++++++++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index bf37062393719..6a1cf61659fd9 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -137,9 +137,8 @@ class AnnotatingParser { private: ScopeType getScopeType(const FormatToken &Token) const { switch (Token.getType()) { - case TT_FunctionLBrace: case TT_LambdaLBrace: - return ST_Function; + return ST_ChildBlock; case TT_ClassLBrace: case TT_StructLBrace: case TT_UnionLBrace: @@ -400,7 +399,8 @@ class AnnotatingParser { OpeningParen.Previous->MatchingParen->isOneOf( TT_ObjCBlockLParen, TT_FunctionTypeLParen)) { Contexts.back().IsExpression = false; - } else if (!Line.MustBeDeclaration && !Line.InPPDirective) { + } else if (!Line.MustBeDeclaration && + (!Line.InPPDirective || (Line.InMacroBody && !Scopes.empty()))) { bool IsForOrCatch = OpeningParen.Previous && OpeningParen.Previous->isOneOf(tok::kw_for, tok::kw_catch); @@ -3649,11 +3649,21 @@ static bool isCtorOrDtorName(const FormatToken *Tok) { } void TokenAnnotator::annotate(AnnotatedLine &Line) { - AnnotatingParser Parser(Style, Line, Keywords, Scopes); + if (!Line.InMacroBody) + MacroBodyScopes.clear(); + + auto &ScopeStack = Line.InMacroBody ? MacroBodyScopes : Scopes; + AnnotatingParser Parser(Style, Line, Keywords, ScopeStack); Line.Type = Parser.parseLine(); - for (auto &Child : Line.Children) - annotate(*Child); + if (!Line.Children.empty()) { + ScopeStack.push_back(ST_ChildBlock); + for (auto &Child : Line.Children) + annotate(*Child); + // ScopeStack can become empty if Child has an unmatched `}`. + if (!ScopeStack.empty()) + ScopeStack.pop_back(); + } // With very deep nesting, ExpressionParser uses lots of stack and the // formatting algorithm is very slow. We're not going to do a good job here @@ -3671,7 +3681,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { if (IsCpp) { FormatToken *OpeningParen = nullptr; auto *Tok = getFunctionName(Line, OpeningParen); - if (Tok && ((!Scopes.empty() && Scopes.back() == ST_Class) || + if (Tok && ((!ScopeStack.empty() && ScopeStack.back() == ST_Class) || Line.endsWith(TT_FunctionLBrace) || isCtorOrDtorName(Tok))) { Tok->setFinalizedType(TT_CtorDtorDeclName); assert(OpeningParen); diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index f4f2bba0eb217..5a02030e5ba7f 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -36,11 +36,11 @@ enum LineType { }; enum ScopeType { + // Contained in child block. + ST_ChildBlock, // Contained in class declaration/definition. ST_Class, - // Contained within function definition. - ST_Function, - // Contained within other scope block (loop, if/else, etc). + // Contained within other scope block (function, loop, if/else, etc). 
ST_Other, }; @@ -269,7 +269,7 @@ class TokenAnnotator { const AdditionalKeywords &Keywords; - SmallVector Scopes; + SmallVector Scopes, MacroBodyScopes; }; } // end namespace format diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index b8a86245808e5..a2986f589396b 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -327,6 +327,21 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { ASSERT_EQ(Tokens.size(), 26u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_BinaryOperator); EXPECT_TOKEN(Tokens[16], tok::ampamp, TT_BinaryOperator); + + Tokens = annotate("#define FOO \\\n" + " void foo() { f(a * b); }"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::star, TT_BinaryOperator); + + Tokens = annotate("#define FOO auto Foo = [] { f(a * b); };"); + ASSERT_EQ(Tokens.size(), 19u) << Tokens; + EXPECT_TOKEN(Tokens[12], tok::star, TT_BinaryOperator); + + Tokens = annotate("namespace {\n" + "#define FOO(x) void foo(a##x *b);\n" + "}"); + ASSERT_EQ(Tokens.size(), 20u) << Tokens; + EXPECT_TOKEN(Tokens[14], tok::star, TT_PointerOrReference); } TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) { From f4b9839d6f7c9ec2967a42f2d5546a2a2ae77ca4 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 10:21:27 +0800 Subject: [PATCH 019/425] [mlir][TensorToSPIRV] Add type check for `tensor.extract` in TensorToSPIRV (#107110) This patch add a type check for `tensor.extract` in TensorToSPIRV. Only convert `tensor.extract` with supported element type. Fix #74466. --- .../Conversion/TensorToSPIRV/TensorToSPIRV.cpp | 2 ++ .../TensorToSPIRV/tensor-ops-to-spirv.mlir | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp index 0fb58623bdafb..468fffdd2df91 100644 --- a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp +++ b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp @@ -45,6 +45,8 @@ class TensorExtractPattern final ConversionPatternRewriter &rewriter) const override { auto tensorType = cast(extractOp.getTensor().getType()); + if (!isa(tensorType.getElementType())) + return rewriter.notifyMatchFailure(extractOp, "unsupported type"); if (!tensorType.hasStaticShape()) return rewriter.notifyMatchFailure(extractOp, "non-static tensor"); diff --git a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir index 32d0fbea65b16..b69c2d0408d17 100644 --- a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir +++ b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir @@ -29,6 +29,24 @@ func.func @tensor_extract_constant(%a : index, %b: index, %c: index) -> i32 { // ----- +// CHECK-LABEL: test_spirv_unsupported_type_index +func.func @test_spirv_unsupported_type_index(%a : index) { + %cst = arith.constant dense<[1, 2]> : tensor<2xindex> + // CHECK: tensor.extract + %extract = tensor.extract %cst[%a] : tensor<2xindex> + return +} + +// CHECK-LABEL: test_spirv_unsupported_type_i128 +func.func @test_spirv_unsupported_type_i128(%a : index) { + %cst = arith.constant dense<[1, 2]> : tensor<2xi128> + // CHECK: tensor.extract + %extract = tensor.extract %cst[%a] : tensor<2xi128> + return +} + +// ----- + //===----------------------------------------------------------------------===// // Type conversion 
//===----------------------------------------------------------------------===// From 37263b6c6741894ffbc0f61979c5c85db515ef2d Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 10:24:17 +0800 Subject: [PATCH 020/425] [mlir][tosa] Add verifier for `tosa.pad` (#106351) This patch adds a verifier to `tosa.pad`, which fixes a crash. `tosa.pad` expects: - same input and output tensor rank. - 'padding' tensor rank equal to 2. Fix #106168. --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 1 + mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 14 +++++++++++ mlir/test/Dialect/Tosa/invalid.mlir | 25 ++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 0be0f8ef2d7a0..1a132e73be864 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1594,6 +1594,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { let hasCanonicalizer = 1; let hasFolder = 1; + let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 267a875710ed7..d93db1b237f31 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -817,6 +817,20 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents( return success(); } +LogicalResult tosa::PadOp::verify() { + RankedTensorType inputType = getInput1().getType(); + RankedTensorType outputType = getOutput().getType(); + TensorType paddingType = getPadding().getType(); + + if (inputType.getRank() != outputType.getRank()) + return emitOpError() << "expect same input and output tensor rank."; + + if (paddingType.hasRank() && paddingType.getRank() != 2) + return emitOpError() << "expect 'padding' tensor rank equal to 2."; + + return success(); +} + static SmallVector convertToMlirShape(ArrayRef shape) { return to_vector(llvm::map_range(shape, [](int64_t dim) { return dim == -1 ?
ShapedType::kDynamic : dim; diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index e72e154f95277..418f7687b3cce 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -72,6 +72,31 @@ func.func @test_pad_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor) -> t // ----- +func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { + // expected-error@+1 {{'tosa.pad' op expect same input and output tensor rank.}} + %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21x3xf32> + return +} + +// ----- + +func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2xi32>) { + // expected-error@+1 {{'tosa.pad' op expect 'padding' tensor rank equal to 2.}} + %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2xi32>) -> tensor<13x21xf32> + return +} + +// ----- + +func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) { + %0 = "tosa.const"() {value = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> + // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<1xf32>'}} + %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<2x2xi32>, tensor<1xf32>) -> tensor<13x21xf32> + return +} + +// ----- + func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> { // expected-error@+1 {{'tosa.transpose' op perms of transpose is not constant}} %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32> From a628bc3c2e7314e4b7c9af0d10cf39a70c731d15 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 19:55:08 -0700 Subject: [PATCH 021/425] [AArch64] Fix a warning This patch fixes: lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc:506:14: error: unused variable 'GIMatchData_matchinfo' [-Werror,-Wunused-variable] --- llvm/lib/Target/AArch64/AArch64Combine.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index f99d1e276c60f..25989fb598895 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -223,7 +223,7 @@ def build_vector_to_dup : GICombineRule< >; def build_vector_to_vector_insert : GICombineRule< - (defs root:$root, register_matchinfo:$matchinfo), + (defs root:$root), (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]), (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }]) From 9a17a6016d02afa6e973f141ab1cada68571f2d2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 20:02:18 -0700 Subject: [PATCH 022/425] [PowerPC] Use DenseMap::operator[] (NFC) (#107044) --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index a57ed33bda9c7..f7188b856461b 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2424,8 +2424,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // or two GPRs, so we need table to record information for later save/restore. 
for (const CalleeSavedInfo &Info : CSI) { if (Info.isSpilledToReg()) { - auto &SpilledVSR = - VSRContainingGPRs.FindAndConstruct(Info.getDstReg()).second; + auto &SpilledVSR = VSRContainingGPRs[Info.getDstReg()]; assert(SpilledVSR.second == 0 && "Can't spill more than two GPRs into VSR!"); if (SpilledVSR.first == 0) From f15e3e58c59b4d31eee24fa9debc5dfad0c20028 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 3 Sep 2024 20:02:29 -0700 Subject: [PATCH 023/425] [CGOpenMPRuntime] Use DenseMap::operator[] (NFC) (#107158) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 34120486996fb..23b977be81602 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1149,10 +1149,8 @@ void CGOpenMPRuntime::emitUserDefinedReduction( /*IsCombiner=*/false); } UDRMap.try_emplace(D, Combiner, Initializer); - if (CGF) { - auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn); - Decls.second.push_back(D); - } + if (CGF) + FunctionUDRMap[CGF->CurFn].push_back(D); } std::pair @@ -1432,10 +1430,8 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, ThreadID = CGF.EmitLoadOfScalar(LVal, Loc); // If value loaded in entry block, cache it and use it everywhere in // function. - if (CGF.Builder.GetInsertBlock() == TopBlock) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - Elem.second.ThreadID = ThreadID; - } + if (CGF.Builder.GetInsertBlock() == TopBlock) + OpenMPLocThreadIDMap[CGF.CurFn].ThreadID = ThreadID; return ThreadID; } } @@ -8640,8 +8636,7 @@ class MappableExprsHandler { const MapData &BaseData = CI == CE ? L : L1; OMPClauseMappableExprCommon::MappableExprComponentListRef SubData = SI == SE ? Components : Components1; - auto &OverlappedElements = OverlappedData.FindAndConstruct(&BaseData); - OverlappedElements.getSecond().push_back(SubData); + OverlappedData[&BaseData].push_back(SubData); } } } @@ -9316,10 +9311,8 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, MapperCGF.EmitBlock(DoneBB, /*IsFinished=*/true); MapperCGF.FinishFunction(); UDMMap.try_emplace(D, Fn); - if (CGF) { - auto &Decls = FunctionUDMMap.FindAndConstruct(CGF->CurFn); - Decls.second.push_back(D); - } + if (CGF) + FunctionUDMMap[CGF->CurFn].push_back(D); } /// Emit the array initialization or deletion portion for user-defined mapper From 86627149f6fd5148311b7b0aa1c7195a05a5d6a8 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 4 Sep 2024 12:15:20 +0900 Subject: [PATCH 024/425] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (#100067) Any SGPR read by a VALU can potentially obscure SALU writes to the same register. Insert s_wait_alu instructions to mitigate the hazard on affected paths. Compute a global cache of SGPRs with any VALU reads and use this to avoid inserting mitigation for SGPRs never accessed by VALUs. To avoid excessive searching when compile time is a priority, implement a secondary mode in which all SALU writes are mitigated.
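For illustration, a minimal sketch of the hazard sequence and the inserted wait, using hypothetical registers and instructions; the 0xfffe immediate, which encodes sa_sdst(0), matches the s_wait_alu form visible in the updated tests:

    v_add_f32 v0, s4, v1    ; (1) VALU reads SGPR pair s[4:5]
    s_mov_b32 s4, s6        ; (2) SALU writes the same SGPR pair
    s_wait_alu 0xfffe       ;     inserted mitigation, sa_sdst(0)
    s_add_i32 s7, s4, 1     ; (3) subsequent SALU/VALU read of s4 is now safe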
Co-authored-by: Shilei Tian --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 288 +++++- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 24 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 24 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 2 + .../AMDGPU/atomic_optimizations_buffer.ll | 68 +- .../atomic_optimizations_global_pointer.ll | 96 +- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 57 +- .../atomic_optimizations_struct_buffer.ll | 57 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 125 ++- .../buffer-fat-pointer-atomicrmw-fmax.ll | 167 +++- .../buffer-fat-pointer-atomicrmw-fmin.ll | 167 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 104 ++- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 176 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 176 +++- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 232 +++-- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 18 +- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 3 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 4 +- llvm/test/CodeGen/AMDGPU/fminimum.ll | 3 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 4 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 106 ++- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 176 +++- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 176 +++- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 232 +++-- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 48 +- .../hazard-recognizer-src-shared-base.ll | 23 + .../AMDGPU/indirect-call-known-callees.ll | 15 +- .../insert_waitcnt_for_precise_memory.ll | 43 +- .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 6 + .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 44 +- ...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 10 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 19 +- ...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 20 +- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 20 +- ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 20 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 16 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 2 + llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 2 + llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 46 +- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 37 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 1 + llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 108 ++- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 113 ++- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 80 +- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 80 +- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 120 ++- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 3 +- .../lower-work-group-id-intrinsics-hsa.ll | 5 +- .../lower-work-group-id-intrinsics-pal.ll | 5 +- .../materialize-frame-index-sgpr.gfx10.ll | 34 + .../AMDGPU/materialize-frame-index-sgpr.ll | 17 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-lastuse.ll | 8 + .../memory-legalizer-flat-nontemporal.ll | 26 + .../memory-legalizer-flat-singlethread.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-system.ll | 156 ++++ .../AMDGPU/memory-legalizer-flat-volatile.ll | 26 + .../AMDGPU/memory-legalizer-flat-wavefront.ll | 154 ++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 148 +++ .../AMDGPU/memory-legalizer-global-agent.ll | 150 +++ .../AMDGPU/memory-legalizer-global-lastuse.ll | 8 + .../memory-legalizer-global-nontemporal.ll | 18 + 
.../memory-legalizer-global-singlethread.ll | 152 +++ .../AMDGPU/memory-legalizer-global-system.ll | 142 +++ .../memory-legalizer-global-volatile.ll | 20 + .../memory-legalizer-global-wavefront.ll | 152 +++ .../memory-legalizer-global-workgroup.ll | 152 +++ .../memory-legalizer-invalid-syncscope.ll | 1 + .../AMDGPU/memory-legalizer-local-agent.ll | 120 +++ .../memory-legalizer-local-nontemporal.ll | 16 + .../memory-legalizer-local-singlethread.ll | 120 +++ .../AMDGPU/memory-legalizer-local-system.ll | 120 +++ .../AMDGPU/memory-legalizer-local-volatile.ll | 14 + .../memory-legalizer-local-wavefront.ll | 120 +++ .../memory-legalizer-local-workgroup.ll | 120 +++ .../memory-legalizer-private-lastuse.ll | 6 + .../memory-legalizer-private-nontemporal.ll | 18 + .../memory-legalizer-private-volatile.ll | 16 + .../AMDGPU/pseudo-scalar-transcendental.ll | 37 +- llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 8 +- llvm/test/CodeGen/AMDGPU/v_swap_b16.ll | 10 +- .../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 168 ++-- .../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 862 ++++++++++++++++++ .../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir | 5 +- 90 files changed, 5924 insertions(+), 915 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll create mode 100644 llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 2c1071c543305..cc39fd1740683 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -44,6 +45,10 @@ static cl::opt cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops.")); +static cl::opt MaxExhaustiveHazardSearch( + "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, + cl::desc("Maximum function size for exhausive hazard search")); + //===----------------------------------------------------------------------===// // Hazard Recognizer Implementation //===----------------------------------------------------------------------===// @@ -51,15 +56,11 @@ static cl::opt static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST); -GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : - IsHazardRecognizerMode(false), - CurrCycleInstr(nullptr), - MF(MF), - ST(MF.getSubtarget()), - TII(*ST.getInstrInfo()), - TRI(TII.getRegisterInfo()), - ClauseUses(TRI.getNumRegUnits()), - ClauseDefs(TRI.getNumRegUnits()) { +GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) + : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), + ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), + TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false), + ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 
19 : 5; TSchedModel.init(&ST); RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); @@ -1195,6 +1196,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); + fixVALUReadSGPRHazard(MI); fixRequiredExportPriority(MI); } @@ -3010,6 +3012,274 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return true; } +// Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. +// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc +static std::optional sgprPairNumber(Register Reg, + const SIRegisterInfo &TRI) { + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + return {}; + default: + break; + } + unsigned RegN = TRI.getEncodingValue(Reg); + if (RegN > 127) + return {}; + return (RegN >> 1) & 0x3f; +} + +// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. +void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { + assert(MMF == &MF); + + // Assume non-empty vector means it has already been computed. + if (!VALUReadHazardSGPRs.empty()) + return; + + auto CallingConv = MF.getFunction().getCallingConv(); + bool IsCallFree = + AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); + + // Exhaustive search is only viable in non-caller/callee functions where + // VALUs will be exposed to the hazard recognizer. + UseVALUReadHazardExhaustiveSearch = + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && + MF.getInstructionCount() <= MaxExhaustiveHazardSearch; + + // Consider all SGPRs hazards if the shader uses function calls or is callee. + bool UseVALUUseCache = + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; + VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); + if (!UseVALUUseCache) + return; + + // Perform a post ordered reverse scan to find VALUs which read an SGPR + // before a SALU write to the same SGPR. This provides a reduction in + // hazard insertion when all VALU access to an SGPR occurs after its last + // SALU write, when compared to a linear scan. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + BitVector SALUWriteSGPRs(64), ReadSGPRs(64); + MachineCycleInfo CI; + CI.compute(*MMF); + + for (auto *MBB : post_order(&MF)) { + bool InCycle = CI.getCycle(MBB) != nullptr; + for (auto &MI : reverse(MBB->instrs())) { + bool IsVALU = SIInstrInfo::isVALU(MI); + bool IsSALU = SIInstrInfo::isSALU(MI); + if (!IsVALU && !IsSALU) + continue; + + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + assert(!Op.getSubReg()); + // Only consider implicit operands of VCC. + if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || + Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) + continue; + if (!TRI.isSGPRReg(MRI, Reg)) + continue; + auto RegN = sgprPairNumber(Reg, TRI); + if (!RegN) + continue; + if (IsVALU && Op.isUse()) { + // Note: any access within a cycle must be considered a hazard. 
+ if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) + VALUReadHazardSGPRs.set(*RegN); + ReadSGPRs.set(*RegN); + } else if (IsSALU) { + if (Op.isDef()) + SALUWriteSGPRs.set(*RegN); + else + ReadSGPRs.set(*RegN); + } + } + } + } +} + +bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { + if (!ST.hasVALUReadSGPRHazard()) + return false; + + // The hazard sequence is fundamentally three instructions: + // 1. VALU reads SGPR + // 2. SALU writes SGPR + // 3. VALU/SALU reads SGPR + // Try to avoid searching for (1) because the expiry point of the hazard is + // indeterminate; however, the hazard between (2) and (3) can expire if the + // gap contains sufficient SALU instructions with no usage of SGPR from (1). + // Note: SGPRs must be considered as 64-bit pairs as hazard exists + // even if individual SGPRs are accessed. + + bool MIIsSALU = SIInstrInfo::isSALU(*MI); + bool MIIsVALU = SIInstrInfo::isVALU(*MI); + if (!(MIIsSALU || MIIsVALU)) + return false; + + // Avoid expensive search when compile time is priority by + // mitigating every SALU which writes an SGPR. + if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { + if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) + return false; + + const MachineOperand *SDSTOp = + TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); + if (!SDSTOp || !SDSTOp->isReg()) + return false; + + const Register HazardReg = SDSTOp->getReg(); + if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || + HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) + return false; + + // Add s_wait_alu sa_sdst(0) after SALU write. + auto NextMI = std::next(MI->getIterator()); + auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + + // SALU write may be s_getpc in a bundle. + updateGetPCBundle(NewMI); + + return true; + } + + // Pre-compute set of SGPR pairs read by VALUs. + // Note: pass mutable pointer to MachineFunction for CycleInfo. + computeVALUHazardSGPRs(MI->getMF()); + + // If no VALUs hazard SGPRs exist then nothing to do. + if (VALUReadHazardSGPRs.none()) + return false; + + // All SGPR writes before a call/return must be flushed as the callee/caller + // will not will not see the hazard chain, i.e. (2) to (3) described above. + const bool IsSetPC = (MI->isCall() || MI->isReturn()) && + !(MI->getOpcode() == AMDGPU::S_ENDPGM || + MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); + + // Collect all SGPR sources for MI which are read by a VALU. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallSet SGPRsUsed; + + if (!IsSetPC) { + for (const MachineOperand &Op : MI->all_uses()) { + Register OpReg = Op.getReg(); + + // Only consider VCC implicit uses on VALUs. + // The only expected SALU implicit access is SCC which is no hazard. + if (MIIsSALU && Op.isImplicit()) + continue; + + if (!TRI.isSGPRReg(MRI, OpReg)) + continue; + + auto RegN = sgprPairNumber(OpReg, TRI); + if (!RegN) + continue; + + if (!VALUReadHazardSGPRs[*RegN]) + continue; + + SGPRsUsed.insert(OpReg); + } + + // No SGPRs -> nothing to do. + if (SGPRsUsed.empty()) + return false; + } + + // A hazard is any SALU which writes one of the SGPRs read by MI. + auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { + if (!SIInstrInfo::isSALU(I)) + return false; + // Ensure SGPR flush before call/return by conservatively assuming every + // SALU writes an SGPR. + if (IsSetPC && I.getNumDefs() > 0) + return true; + // Check for any register writes. 
+ return any_of(SGPRsUsed, [this, &I](Register Reg) { + return I.modifiesRegister(Reg, &TRI); + }); + }; + + const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; + auto IsExpiredFn = [&](const MachineInstr &I, int Count) { + if (Count >= SALUExpiryCount) + return true; + // s_wait_alu sa_sdst(0) on path mitigates hazard. + if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) + return true; + return false; + }; + + auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { + // Only count true SALUs as wait states. + if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) + return 0; + // SALU must be unrelated to any hazard registers. + if (any_of(SGPRsUsed, + [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) + return 0; + return 1; + }; + + // Check for the hazard. + DenseSet Visited; + int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, WaitStatesFn); + + if (WaitStates >= SALUExpiryCount) + return false; + + // Validate hazard through an exhaustive search. + if (UseVALUReadHazardExhaustiveSearch) { + // A hazard is any VALU which reads one of the paired SGPRs read by MI. + // This is searching for (1) in the hazard description. + auto hazardPair = [this](Register Reg) { + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) + return Register(AMDGPU::VCC); + auto RegN = sgprPairNumber(Reg, TRI); + return Register(AMDGPU::SGPR0_SGPR1 + *RegN); + }; + auto SearchHazardFn = [this, hazardPair, + &SGPRsUsed](const MachineInstr &I) { + if (!SIInstrInfo::isVALU(I)) + return false; + // Check for any register reads. + return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { + return I.readsRegister(hazardPair(Reg), &TRI); + }); + }; + auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { + return false; + }; + if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == + std::numeric_limits::max()) + return false; + } + + // Add s_wait_alu sa_sdst(0) before SALU read. + auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + + // SALU read may be after s_getpc in a bundle. + updateGetPCBundle(NewMI); + + return true; +} + static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII) { MachineBasicBlock &EntryMBB = MF->front(); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index f2a64ab48e180..e840e2445188f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -48,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const SIRegisterInfo &TRI; TargetSchedModel TSchedModel; bool RunLdsBranchVmemWARHazardFixup; + BitVector VALUReadHazardSGPRs; + bool UseVALUReadHazardExhaustiveSearch; /// RegUnits of uses in the current soft memory clause. 
BitVector ClauseUses; @@ -107,6 +109,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); + void computeVALUHazardSGPRs(MachineFunction *MMF); + bool fixVALUReadSGPRHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 9386bcf0d74b2..7b74eab96c567 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1247,6 +1247,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } + bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 61d2c854dffa5..df81b926bceb3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -623,13 +623,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,12 +781,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 83be67a9138f6..53d9bf0751a1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -623,13 +623,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,12 +781,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 
v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 04833eaaa3283..50d40368dd107 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1404,6 +1404,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1494,6 +1495,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s2, 3 ; GFX12-NEXT: s_mov_b32 s1, 2 ; GFX12-NEXT: s_mov_b32 s0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 17fe3adc22169..5cf9c9faa693e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -226,6 +226,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -233,8 +234,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -263,13 +265,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -509,6 +514,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; 
GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -517,8 +523,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -547,10 +554,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -559,6 +568,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -878,7 +888,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -899,6 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -924,22 +935,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -948,6 +962,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1289,7 +1304,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -1312,6 +1327,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 @@ -1338,22 +1354,25 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: @@ -1364,6 +1383,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1719,6 +1739,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1726,8 +1747,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1757,13 +1779,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2006,6 +2031,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -2014,8 +2040,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: @@ -2045,10 +2072,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: @@ -2378,7 +2407,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: 
s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -2399,6 +2428,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2424,22 +2454,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2448,6 +2481,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 16f3ff4be6b50..ed036a83b6143 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -266,6 +266,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -273,8 +274,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 @@ -307,10 +310,13 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB0_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -320,6 +326,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -597,6 +604,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -640,6 +648,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -1012,7 +1021,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 @@ -1033,6 +1042,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -1064,23 +1074,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; 
GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: @@ -1095,6 +1108,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB2_4: +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 @@ -1520,6 +1534,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1527,12 +1542,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 @@ -1550,6 +1567,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB2_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 @@ -1595,8 +1613,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -1604,6 +1622,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: 
s_mov_b32 s7, 0x31016000 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 @@ -1900,6 +1919,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1910,6 +1930,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -1945,10 +1966,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -2288,6 +2312,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2298,6 +2323,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 @@ -2336,6 +2362,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -2349,6 +2376,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -2763,7 +2791,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 @@ 
-2785,6 +2813,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2819,8 +2848,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 @@ -2828,6 +2858,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -3594,6 +3625,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -3601,6 +3633,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec @@ -3619,6 +3652,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB5_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 @@ -3698,6 +3732,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -3715,6 +3750,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB5_2: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 
@@ -3989,6 +4025,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -3996,8 +4033,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 @@ -4031,10 +4070,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB6_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -4044,6 +4086,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -4326,6 +4369,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4370,6 +4414,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -4743,7 +4788,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 @@ -4764,6 +4809,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; 
GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4795,23 +4841,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: @@ -4826,6 +4875,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB8_4: +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 @@ -5251,6 +5301,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5258,12 +5309,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 @@ -5281,6 +5334,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB8_2: +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 @@ -5326,8 +5380,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5335,6 +5389,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 @@ -5646,6 +5701,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5656,6 +5712,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -5694,10 +5751,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -6053,6 +6113,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6063,6 +6124,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 @@ -6105,6 +6167,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ 
-6118,6 +6181,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 @@ -6536,7 +6600,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 @@ -6558,6 +6622,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6592,8 +6657,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 @@ -6601,6 +6667,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7367,6 +7434,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7374,6 +7442,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec @@ -7392,6 +7461,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB11_2: +; GFX1264_DPP-NEXT: 
s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 @@ -7471,6 +7541,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -7488,6 +7559,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB11_2: +; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 3a2efadac067d..9d4dfd8911257 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -225,6 +225,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -232,8 +233,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -262,13 +264,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -508,6 +513,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 
@@ -516,8 +522,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -546,10 +553,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -558,6 +567,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -877,7 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -898,6 +908,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -923,22 +934,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; 
implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -947,6 +961,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1302,6 +1317,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1309,8 +1325,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1340,13 +1357,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1589,6 +1609,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1597,8 +1618,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: @@ -1628,10 +1650,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: @@ -1961,7 +1985,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -1982,6 +2006,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2007,22 +2032,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2031,6 +2059,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index d0c0b62c78e42..3fb44e090c61f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -232,6 +232,7 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -240,8 +241,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -270,13 +272,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -523,6 +528,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -532,8 +538,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: @@ -562,10 +569,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: @@ -574,6 +583,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr 
addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -899,7 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -921,6 +931,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -946,22 +957,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: @@ -970,6 +984,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1466,6 +1481,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1474,8 +1490,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, 
s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -1505,13 +1522,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1761,6 +1781,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1770,8 +1791,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: @@ -1801,10 +1823,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: @@ -2140,7 +2164,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 @@ -2162,6 +2186,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; 
GFX12W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -2187,22 +2212,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: @@ -2211,6 +2239,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 63cdd8a3bb16d..e195026c13d27 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -436,13 +436,16 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -450,6 +453,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2049,7 +2053,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2067,12 +2071,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2323,7 +2329,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 @@ -2340,12 +2346,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2596,12 +2604,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: @@ -2626,11 +2637,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 @@ -2640,11 +2654,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3114,7 +3129,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -3132,12 +3147,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3408,7 +3425,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -3426,12 +3443,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3685,14 +3704,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3705,6 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 @@ -3714,12 +3737,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4055,14 +4079,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4075,6 +4102,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 @@ -4084,11 +4112,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4430,11 +4459,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4464,11 +4496,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4478,12 +4513,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5056,14 +5092,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start @@ -5084,6 +5123,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -5093,12 +5133,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5481,14 +5522,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start @@ -5509,6 +5553,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -5518,11 +5563,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; 
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5910,11 +5956,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -5954,11 +6003,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -5968,12 +6020,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7127,13 +7180,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7141,6 +7197,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9585,13 +9642,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: @@ -9599,6 +9659,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c90296124eb12..c7569a6c155db 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1734,12 +1742,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1767,11 +1778,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1781,11 +1795,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2150,7 +2165,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2170,12 +2185,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2461,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2481,12 +2498,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2681,14 +2700,17 @@ define 
half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2701,8 +2723,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -2712,12 +2735,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3067,14 +3091,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3087,8 +3114,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -3098,11 +3126,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3457,11 +3486,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3495,11 +3527,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3509,12 +3544,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4100,14 +4136,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4128,6 +4167,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -4137,12 +4177,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4527,14 +4568,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4555,6 +4599,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -4564,11 +4609,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4958,11 +5004,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -5002,11 +5051,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -5016,12 +5068,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5643,8 +5696,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -5662,12 +5716,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: 
s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5987,6 +6043,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -6004,12 +6061,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6329,12 +6388,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6362,11 +6424,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6376,12 +6441,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6937,8 +7003,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -6972,12 +7038,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7382,7 +7450,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 @@ -7414,12 +7482,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7823,12 +7893,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; 
GFX12-NEXT: ; %bb.2: @@ -7871,11 +7944,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7885,12 +7961,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 91adbfa559976..9f97d2033bbb5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1734,12 +1742,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1767,11 +1778,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1781,11 +1795,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2150,7 +2165,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2170,12 +2185,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2461,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 @@ -2481,12 +2498,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2681,14 +2700,17 @@ define 
half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2701,8 +2723,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -2712,12 +2735,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3067,14 +3091,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3087,8 +3114,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -3098,11 +3126,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3457,11 +3486,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3495,11 +3527,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3509,12 +3544,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4100,14 +4136,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4128,6 +4167,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 @@ -4137,12 +4177,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4527,14 +4568,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4555,6 +4599,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 @@ -4564,11 +4609,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4958,11 +5004,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -5002,11 +5051,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -5016,12 +5068,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5643,8 +5696,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -5662,12 +5716,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: 
s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5987,6 +6043,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -6004,12 +6061,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6329,12 +6388,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6362,11 +6424,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6376,12 +6441,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6937,8 +7003,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -6972,12 +7038,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7382,7 +7450,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 @@ -7414,12 +7482,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7823,12 +7893,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; 
GFX12-NEXT: ; %bb.2: @@ -7871,11 +7944,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7885,12 +7961,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 422c8a0be23b4..1ae1204e3cde1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5707,13 +5707,15 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -5888,13 +5890,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6070,13 +6074,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6263,12 +6269,14 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6432,12 +6440,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6608,12 +6618,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6816,13 +6828,15 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -7105,13 +7119,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7403,13 +7419,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7700,12 +7718,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7978,12 +7998,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8265,12 +8287,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8541,12 +8565,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8756,13 +8782,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8990,13 +9018,15 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9289,12 +9319,14 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9590,13 +9622,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9933,13 +9967,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10286,13 +10322,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10639,12 +10677,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10981,12 +11021,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11313,13 +11355,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11596,12 +11640,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11882,12 +11928,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12215,13 +12263,15 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12569,12 +12619,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ea2427a3c420f..ed78f4a071e3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], 
v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] 
; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5364,12 +5388,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10381,13 +10437,15 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12317,13 +12389,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 2767b66e44703..bdb945a652eb2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5364,12 +5388,14 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10381,13 +10437,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12317,13 +12389,15 @@ 
define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 6672f16c4a7a8..c7f2bf6d1b317 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -34,13 +34,15 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: @@ -229,13 +231,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -428,13 +432,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -644,12 +650,14 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -829,12 +837,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1021,12 +1031,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1232,13 +1244,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1432,12 +1446,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1631,13 +1647,15 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: @@ -1826,13 +1844,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2025,13 +2045,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2241,12 +2263,14 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2426,12 +2450,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2618,12 +2644,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2829,13 +2857,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3029,12 +3059,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3228,13 +3260,15 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: @@ -3439,13 +3473,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3651,13 +3687,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3877,12 +3915,14 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -4072,12 +4112,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4274,12 +4316,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; 
GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4513,13 +4557,15 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -4802,13 +4848,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5100,13 +5148,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5397,12 +5447,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5675,12 +5727,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5962,12 +6016,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6239,13 +6295,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -6460,12 +6518,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6687,13 +6747,15 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -6986,12 +7048,14 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7287,13 +7351,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -7630,13 +7696,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -7983,13 +8051,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -8334,12 +8404,14 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8666,12 +8738,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -9008,12 +9082,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9340,13 +9416,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -9623,12 +9701,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9912,13 +9992,15 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -10266,12 +10348,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10591,13 +10675,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: @@ -10809,13 +10895,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -11030,13 +11118,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11268,12 +11358,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11475,12 +11567,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11689,12 +11783,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11922,13 +12018,15 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12144,12 +12242,14 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12385,13 +12485,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -12730,13 +12832,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -13078,13 +13182,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13443,12 +13549,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13777,12 +13885,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14118,12 +14228,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14478,13 +14590,15 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14827,12 +14941,14 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 9a9fd289e2d0c..9653f8fdacac6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -265,10 +265,11 @@ define void @zero_init_foo() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -354,10 +355,11 @@ define void @zero_init_foo() { ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -1315,10 +1317,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -1414,10 +1417,11 @@ define void @zero_init_small_offset_foo() { ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 @@ -2526,10 +2530,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_clause 0x3 @@ -2666,10 +2671,11 @@ define void @zero_init_large_offset_foo() { ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_mov_b32 s0, 0 -; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: s_mov_b32 s1, s0 ; GFX12-PAL-NEXT: s_mov_b32 s2, s0 ; GFX12-PAL-NEXT: s_mov_b32 s3, s0 +; GFX12-PAL-NEXT: s_wait_alu 0xfffe ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-PAL-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 3b7009023b03a..04cd150d93176 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2 ; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 9ce1ba3316dd5..27282a453075b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -462,7 +462,7 @@ define float @v_fmaximum3_f32_const1_const2(float %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0x41000000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1409,7 +1409,7 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_movk_i32 s0, 0x4800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 817e6dd87361f..3271758f71297 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2 ; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %val = call <3 x 
half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 21074d58bdb7e..d9ba2de48bb01 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -462,7 +462,7 @@ define float @v_fminimum3_f32_const1_const2(float %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, 0x41000000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1409,7 +1409,7 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_movk_i32 s0, 0x4800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 064238c63717e..361cc1e9e6c1d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -6963,13 +6963,15 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7184,13 +7186,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7406,13 +7410,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7635,12 +7641,14 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7838,12 +7846,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8044,12 +8054,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8277,13 +8289,15 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8616,13 +8630,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8966,13 +8982,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9315,12 +9333,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9642,12 +9662,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9979,12 +10001,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10306,13 +10330,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10566,12 +10592,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10830,13 +10858,15 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11181,12 +11211,14 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11532,13 +11564,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11925,13 +11959,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12330,13 +12366,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12733,12 +12771,14 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13114,12 +13154,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13506,12 +13548,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13888,13 +13932,15 @@ 
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14210,12 +14256,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14536,13 +14584,15 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB62_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14942,12 +14992,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23072,8 +23124,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 
06d971febd038..84003a0432f7e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4821,13 +4839,15 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7507,12 +7541,14 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13127,12 +13193,14 @@ 
define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15997,12 +16079,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 65df8f07fb8b3..2aad91cd1071f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4821,13 +4839,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7507,12 +7541,14 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13127,12 +13193,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15997,12 +16079,14 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 79aa69771f84b..2e3799e1714af 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -35,13 +35,15 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32: @@ -266,13 +268,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -499,13 +503,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -741,12 +747,14 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32: @@ -961,12 +969,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1184,12 +1194,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1416,13 +1428,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1650,12 +1664,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1880,13 +1896,15 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz: @@ -2111,13 +2129,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2344,13 +2364,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2586,12 +2608,14 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2806,12 +2830,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3029,12 +3055,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3261,13 +3289,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3495,12 +3525,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3725,13 +3757,15 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64: @@ -3976,13 +4010,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4228,13 +4264,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4487,12 +4525,14 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4716,12 +4756,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4948,12 +4990,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5207,13 +5251,15 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5546,13 +5592,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5896,13 +5944,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6245,12 +6295,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6572,12 +6624,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6909,12 +6963,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7236,13 +7292,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7496,12 +7554,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7760,13 +7820,15 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8111,12 +8173,14 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8462,13 +8526,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -8855,13 +8921,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9260,13 +9328,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9663,12 +9733,14 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -10044,12 +10116,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10436,12 +10510,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10818,13 +10894,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -11140,12 +11218,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11466,13 +11546,15 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11872,12 +11954,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12247,13 +12331,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -12522,13 +12608,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12799,13 +12887,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -13079,12 +13169,14 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13341,12 +13433,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13606,12 +13700,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13880,13 +13976,15 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14158,12 +14256,14 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14450,13 +14550,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -14848,13 +14950,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -15248,13 +15352,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15651,12 +15757,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -16036,12 +16144,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16424,12 +16534,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16821,13 +16933,15 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17222,12 +17336,14 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index d1a371fc4356f..24fd709514b47 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -280,7 +280,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -571,7 +571,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -864,7 +864,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1739,7 +1739,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2287,7 +2287,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2562,7 +2562,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2835,7 +2835,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3383,7 +3383,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3658,7 +3658,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -3931,7 +3931,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -4224,7 +4224,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4515,7 +4515,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4920,7 +4920,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5504,7 +5504,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5795,7 +5795,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6102,7 +6102,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6427,7 +6427,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV
@@ -6518,7 +6518,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
new file mode 100644
index 0000000000000..4aa49f2c9296d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=gfx1201 %s -o - | FileCheck %s
+
+define amdgpu_kernel void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %bb1
+
+bb0:
+ br label %bb1
+
+bb1:
+ %dst = phi ptr [ null, %bb0 ], [ addrspacecast (ptr addrspace(3) null to ptr), %entry ]
+ store i64 0, ptr %dst, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index eb4cba35e9946..44a2c34b06b57 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -44,15 +44,19 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-LABEL: indirect_call_known_no_special_inputs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_getpc_b64 s[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24
; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], 0
; GFX12-NEXT: s_getpc_b64 s[8:9]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+24
; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0
@@ -61,12 +65,13 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s8, 1, s12
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_eq_u32 s8, 1
; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-NEXT: s_cselect_b32 s7, s7, s5
; GFX12-NEXT: s_cselect_b32 s6, s6, s4
; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 3b972352e0e45..0045082eedb0a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -282,13 +282,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
ret i32 %result
@@ -401,15 +403,18 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) { ; GFX12-NEXT: scratch_load_b32 v32, off, s32 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+16 +; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24 ; GFX12-NEXT: scratch_store_b32 off, v32, s32 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[0:1] entry: %alloca = alloca double, align 8, addrspace(5) @@ -587,25 +592,34 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cvt_u32_f32 s4, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_i32 s5, s5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s4, s4, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s5, s4, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sub_co_i32 s2, s2, s5 ; GFX12-NEXT: s_add_co_i32 s5, s4, 1 ; GFX12-NEXT: s_sub_co_i32 s6, s2, s3 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cselect_b32 s4, s5, s4 ; GFX12-NEXT: s_cselect_b32 s2, s6, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s5, s4, 1 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cselect_b32 s2, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -789,9 +803,11 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s0, s0, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 @@ -1036,15 +1052,18 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s4, 
s[2:3], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1220,13 +1239,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1645,4 +1667,3 @@ entry: %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc } - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 9445f1225e0cb..99f4fbf359948 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -64,6 +64,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -83,6 +84,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -100,6 +102,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -169,6 +172,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 
@llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -188,6 +192,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -205,6 +210,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index aad74410d1453..c4a86952bc414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -587,7 +587,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -664,9 +665,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -680,9 +681,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -728,7 +729,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -805,9 +807,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -821,9 +823,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3298,7 +3300,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3344,7 +3347,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3421,9 +3425,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3437,9 +3441,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3516,9 +3520,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3532,9 +3536,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 320b0b4508b6a..8a0602e0472b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -81,23 +81,27 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr6 ; GFX12-NEXT: ; implicit-def: $vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 8bfe996c6a90a..6e029f7c0a95e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -210,6 +210,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) { ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var: @@ -489,6 +490,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst m0 ; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: global_load_b32 v1, v[2:3], off @@ -516,8 +518,9 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0 +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 ; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off @@ -741,6 +744,7 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: @@ -752,11 +756,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0 ; GLOBAL-ISEL-NEXT: s_barrier_init m0 +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) ret void @@ -945,6 +950,7 @@ define void @test5_s_barrier_join_m0(i32 %arg) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_join m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: @@ -1202,6 +1208,7 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_wakeup_barrier m0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: @@ -1386,10 +1393,11 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ; 
GCN-NEXT: s_wait_bvhcnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_get_barrier_state s0, m0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1403,7 +1411,8 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 78204dfefc80c..2efade9fcbba1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -50,23 +50,27 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x bfloat> %ret @@ -90,22 +94,26 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: 
s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 1005996003044..d5b5c71cc42a9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -306,22 +306,26 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -444,22 +448,26 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; 
GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5f6a67e466020..a312a3cb0a95c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -235,23 +235,27 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret @@ -344,23 +348,27 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index bd803c380e90a..8d1dce76d2cc8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index c9b50eddc94ee..06b1a9cc70513 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 
idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll index df5533b629502..5ea89bc574910 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -49,7 +49,7 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 5d3a5800bcdd8..bc0daf95e329c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -515,11 +515,13 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_maximum_f16 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.maximum.f16(half %src0, half %src1) %cast = bitcast half %op to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index e6655aeab7e9b..6b61931fc9414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -485,6 +485,7 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call float @llvm.maximum.f32(float %src0, float %src1) call void asm sideeffect "; use $0", "s"(float %op) @@ -888,6 +889,7 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:1] ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1) call void asm sideeffect "; use $0", "s"(<2 x float> %op) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 01effc24e741d..77b5682a2dbd1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -424,11 +424,13 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_minimum_f16 s0, s0, s1 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.minimum.f16(half %src0, half %src1) %cast = bitcast half %op to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 518fc27c23082..8753dc50c4da4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -485,6 +485,7 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call float @llvm.minimum.f32(float %src0, float %src1) call void asm sideeffect "; use $0", "s"(float %op) @@ -888,6 +889,7 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:1] ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1) call void asm sideeffect "; use $0", "s"(<2 x float> %op) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 7178eaf2e7384..0221f9992ad43 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -2360,6 +2360,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2 ; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2 ; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3 ; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 @@ -2397,6 +2398,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015 ; GFX12-NEXT: v_and_b32_e32 v22, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11 ; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7 ; GFX12-NEXT: v_and_b32_e32 v4, 1, v5 @@ -2794,6 +2796,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000 ; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013 ; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3 ; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3 ; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3 @@ -2807,7 +2810,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016 ; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2 ; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1 ; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9 @@ -3454,6 +3457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 ; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3 ; GFX12-NEXT: v_and_b32_e32 v44, 1, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
v_lshrrev_b16 v1, 1, s4 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 ; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2 @@ -3467,14 +3471,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3 ; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3 ; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5 ; GFX12-NEXT: s_and_b32 s7, s2, 1 ; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9 ; GFX12-NEXT: v_and_b32_e32 v9, 1, v1 ; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4 ; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3 ; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5 ; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3 @@ -3489,30 +3495,34 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012 ; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011 ; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13 ; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17 ; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5 ; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14 ; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11 ; GFX12-NEXT: v_and_b32_e32 v11, 1, v1 ; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5 ; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7 ; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5 ; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10 ; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5 ; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6 ; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5 ; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4 ; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 @@ -3522,7 +3532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 ; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15 ; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2 @@ -3541,6 +3551,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o 
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015 ; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6 ; GFX12-NEXT: v_and_b32_e32 v6, 1, v17 ; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23 @@ -4266,6 +4277,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2 ; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4 ; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4 ; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4 @@ -4311,7 +4323,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016 ; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014 ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3 ; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1 ; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1 @@ -6791,6 +6803,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2 ; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3 ; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3 @@ -6808,6 +6821,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v28, 1, v21 ; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 @@ -6817,6 +6831,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v11, 1, v11 ; GFX12-NEXT: v_and_b32_e32 v13, 1, v13 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 @@ -6827,6 +6842,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v9, 1, v17 ; GFX12-NEXT: v_and_b32_e32 v29, 1, v23 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 @@ -6842,6 +6858,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1 ; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26 @@ -7554,6 +7571,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64 ; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: 
v_dual_mov_b32 v11, s3 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16 @@ -8449,6 +8467,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v43, 1, v10 ; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2 ; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3 ; GFX12-NEXT: v_mov_b32_e32 v66, v1 ; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1 @@ -8457,6 +8476,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3 ; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3 ; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4 ; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4 ; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016 @@ -8465,6 +8485,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v45, 1, v12 ; GFX12-NEXT: v_and_b32_e32 v41, 1, v16 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: s_lshr_b32 s5, s2, 24 @@ -8473,6 +8494,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8 ; GFX12-NEXT: v_and_b32_e32 v44, 1, v14 ; GFX12-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5 ; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5 ; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5 @@ -8483,6 +8505,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013 ; GFX12-NEXT: v_and_b32_e32 v33, 1, v20 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3 @@ -8509,6 +8532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 @@ -8518,6 +8542,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35 ; GFX12-NEXT: v_and_b32_e32 v35, 1, v27 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1 ; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4 @@ -8529,6 +8554,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31 ; GFX12-NEXT: v_and_b32_e32 v31, 1, v29 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 @@ -8538,6 +8564,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2 ; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33 ; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2 @@ -8561,6 +8588,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39 ; GFX12-NEXT: v_and_b32_e32 v39, 1, v25 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7 ; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5 @@ -9818,6 +9846,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43 ; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1 ; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v65, s40 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144 @@ -9903,6 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49 ; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87 ; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4 ; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 22b718935738b..2ee1c60b4bbf2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2974,6 +2974,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: s_and_b32 s7, s7, 0xffff ; GFX12-NEXT: s_lshr_b32 s25, s6, 16 ; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 ; GFX12-NEXT: v_mov_b32_e32 v10, s11 ; GFX12-NEXT: s_lshr_b32 s22, s5, 16 @@ -3464,6 +3465,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: s_ashr_i32 s25, s6, 16 ; GFX12-NEXT: s_sext_i32_i16 s7, s7 ; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 ; GFX12-NEXT: v_mov_b32_e32 v10, s11 ; GFX12-NEXT: s_ashr_i32 s22, s5, 16 @@ -5795,10 +5797,10 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -6030,6 +6032,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: 
global_store_b128 v1, v[0:3], s[0:1] offset:16 @@ -6370,23 +6373,27 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s2, 0xffff, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s6 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -6966,36 +6973,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: s_lshr_b32 s5, s4, 16 ; GFX12-NEXT: s_and_b32 s4, s4, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshr_b32 s4, s7, 16 ; GFX12-NEXT: s_and_b32 s5, s7, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s4, s6, 16 ; GFX12-NEXT: s_and_b32 s5, s6, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: s_and_b32 s2, s2, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_lshr_b32 s2, s1, 16 ; GFX12-NEXT: s_and_b32 s1, s1, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s1, s0, 16 ; GFX12-NEXT: s_and_b32 s0, s0, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] @@ -8047,76 +8061,91 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_lshr_b32 s15, s14, 16 ; GFX12-NEXT: s_and_b32 s14, s14, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
v_mov_b32_e32 v0, s14 ; GFX12-NEXT: v_mov_b32_e32 v2, s15 ; GFX12-NEXT: s_lshr_b32 s14, s13, 16 ; GFX12-NEXT: s_and_b32 s13, s13, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224 ; GFX12-NEXT: v_mov_b32_e32 v0, s13 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s14 ; GFX12-NEXT: s_lshr_b32 s13, s12, 16 ; GFX12-NEXT: s_and_b32 s12, s12, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s12 ; GFX12-NEXT: v_mov_b32_e32 v2, s13 ; GFX12-NEXT: s_lshr_b32 s12, s11, 16 ; GFX12-NEXT: s_and_b32 s11, s11, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192 ; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s12 ; GFX12-NEXT: s_lshr_b32 s11, s10, 16 ; GFX12-NEXT: s_and_b32 s10, s10, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s10 ; GFX12-NEXT: v_mov_b32_e32 v2, s11 ; GFX12-NEXT: s_lshr_b32 s10, s9, 16 ; GFX12-NEXT: s_and_b32 s9, s9, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160 ; GFX12-NEXT: v_mov_b32_e32 v0, s9 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s9, s8, 16 ; GFX12-NEXT: s_and_b32 s8, s8, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s8 ; GFX12-NEXT: v_mov_b32_e32 v2, s9 ; GFX12-NEXT: s_lshr_b32 s8, s7, 16 ; GFX12-NEXT: s_and_b32 s7, s7, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128 ; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: s_lshr_b32 s7, s6, 16 ; GFX12-NEXT: s_and_b32 s6, s6, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: v_mov_b32_e32 v2, s7 ; GFX12-NEXT: s_lshr_b32 s6, s5, 16 ; GFX12-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96 ; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: s_lshr_b32 s5, s4, 16 ; GFX12-NEXT: s_and_b32 s4, s4, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: s_and_b32 s2, s2, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_lshr_b32 s2, s1, 16 ; GFX12-NEXT: s_and_b32 s1, s1, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32 ; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s1, s0, 16 ; GFX12-NEXT: s_and_b32 s0, s0, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] @@ -8926,6 +8955,7 @@ define 
amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61 ; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59 ; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57 ; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55 @@ -8937,6 +8967,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192 ; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 ; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44 ; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index f1a6bccc559f0..4ab55164e0999 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -4390,6 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22 ; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23 ; GFX12-NEXT: v_mov_b32_e32 v5, s56 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 67a376b8c0f3c..7f26ad7009e44 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -1119,8 +1119,8 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_and_b32 s3, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1223,8 +1223,8 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_sext_i32_i8 s3, s2 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] @@ -1332,6 +1332,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -1439,6 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_ashr_i32 s3, s2, 24 ; 
GFX12-NEXT: s_sext_i32_i8 s4, s2 ; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -1597,6 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_mov_b32_e32 v6, s3 @@ -1761,6 +1764,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_sext_i32_i8 s3, s3 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6 ; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_mov_b32_e32 v4, s3 @@ -2018,11 +2022,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s3, s5, 24 ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3 ; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 ; GFX12-NEXT: s_and_b32 s10, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9 @@ -2294,6 +2300,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: s_ashr_i32 s2, s4, 24 @@ -2305,6 +2312,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v8, s5 ; GFX12-NEXT: v_mov_b32_e32 v10, s9 ; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v12, s4 ; GFX12-NEXT: v_mov_b32_e32 v14, s3 ; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 @@ -2753,7 +2761,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s21, s7, 0xff ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13 ; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -2767,6 +2775,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14 ; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20 ; GFX12-NEXT: s_lshr_b32 s3, s5, 
24 ; GFX12-NEXT: s_and_b32 s19, s5, 0xff @@ -2776,6 +2785,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s18, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 @@ -3263,6 +3273,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_sext_i32_i8 s7, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22 ; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8 @@ -3276,6 +3287,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18 ; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16 ; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_mov_b32_e32 v30, s19 ; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010 @@ -3288,6 +3300,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v20, s6 ; GFX12-NEXT: v_mov_b32_e32 v22, s15 ; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v16, s5 ; GFX12-NEXT: v_mov_b32_e32 v18, s13 ; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 @@ -4116,11 +4129,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 ; GFX12-NEXT: v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5 ; GFX12-NEXT: v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v58, s15 ; GFX12-NEXT: s_and_b32 s43, s8, 0xff ; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX12-NEXT: s_and_b32 s48, s13, 0xff ; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0 ; GFX12-NEXT: v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8 ; GFX12-NEXT: s_lshr_b32 s27, s9, 24 @@ -4132,6 +4147,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX12-NEXT: s_and_b32 s47, s12, 0xff ; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2 ; GFX12-NEXT: v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42 ; GFX12-NEXT: s_lshr_b32 s25, s7, 24 @@ -4139,6 +4155,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12 ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7 ; GFX12-NEXT: s_lshr_b32 s28, s10, 24 ; GFX12-NEXT: s_lshr_b32 s29, s11, 24 @@ -4148,6 +4165,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s45, s10, 0xff ; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX12-NEXT: s_and_b32 s46, s11, 0xff +; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4 ; GFX12-NEXT: v_dual_mov_b32 v47, s30 :: v_dual_mov_b32 v22, s41 ; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 @@ -4162,10 +4180,11 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v60, v[52:55], s[16:17] offset:224 ; GFX12-NEXT: global_store_b128 v60, v[48:51], s[16:17] offset:208 ; GFX12-NEXT: global_store_b128 v60, v[44:47], s[16:17] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29 ; GFX12-NEXT: v_mov_b32_e32 v24, s6 ; GFX12-NEXT: s_and_b32 s40, s5, 0xff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40 ; GFX12-NEXT: s_lshr_b32 s23, s5, 24 ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 @@ -4175,6 +4194,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_and_b32 s39, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4 ; GFX12-NEXT: s_lshr_b32 s21, s3, 24 ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 @@ -4187,10 +4207,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v60, v[26:29], s[16:17] offset:112 ; GFX12-NEXT: global_store_b128 v60, v[22:25], s[16:17] offset:96 ; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v14, s3 ; GFX12-NEXT: s_lshr_b32 s20, s2, 24 ; GFX12-NEXT: s_and_b32 s37, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37 ; GFX12-NEXT: s_lshr_b32 s19, s1, 24 ; GFX12-NEXT: s_and_b32 s36, s1, 0xff @@ -4199,6 +4221,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s18, s0, 24 ; GFX12-NEXT: s_and_b32 s35, s0, 0xff ; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36 ; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19 ; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18 @@ -5061,6 +5084,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_ashr_i32 s47, s14, 24 ; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s14, s14 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 ; GFX12-NEXT: s_ashr_i32 s45, s13, 24 @@ -5080,6 +5104,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s11, s11 ; GFX12-NEXT: v_bfe_i32 v45, v3, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43 ; GFX12-NEXT: v_mov_b32_e32 v46, s46 ; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 @@ -5104,12 +5129,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v59, v[40:43], s[16:17] offset:192 ; GFX12-NEXT: v_mov_b32_e32 v41, s39 ; GFX12-NEXT: v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v37, s37 ; GFX12-NEXT: s_ashr_i32 
s33, s7, 24 ; GFX12-NEXT: s_ashr_i32 s35, s8, 24 ; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s8, s8 ; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35 ; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33 ; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 @@ -5131,6 +5158,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 ; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26 ; GFX12-NEXT: v_mov_b32_e32 v32, s36 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 @@ -5158,6 +5186,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28 ; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s2, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v16, s4 ; GFX12-NEXT: v_mov_b32_e32 v18, s27 ; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010 @@ -5171,6 +5200,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v8, s2 ; GFX12-NEXT: v_mov_b32_e32 v10, s23 ; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s21 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -5869,7 +5899,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_lshr_b32 s4, s2, 24 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -5877,6 +5907,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -6027,6 +6058,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v6, s6 @@ -6225,26 +6257,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_lshr_b32 s5, s3, 24 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_lshr_b32 s4, s2, 24 ; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: s_and_b32 s2, s3, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 @@ -6490,7 +6525,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9 @@ -6831,47 +6866,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_lshr_b32 s3, s7, 24 ; GFX12-NEXT: s_lshr_b32 s2, s5, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s2, s6, 24 ; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 ; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_and_b32 s2, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 ; GFX12-NEXT: s_and_b32 s2, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: s_and_b32 s2, s5, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: s_and_b32 s2, s4, 0xff ; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -7303,6 +7346,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5 @@ -7939,48 +7983,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 ; GFX12-NEXT: s_lshr_b32 s11, s7, 24 ; GFX12-NEXT: s_lshr_b32 s10, s5, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 ; GFX12-NEXT: s_and_b32 s7, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s3, 24 ; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s1, 24 ; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s6, 24 ; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010 ; GFX12-NEXT: s_and_b32 s6, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s4, 24 ; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s2, 24 ; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s0, 24 ; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 @@ -7996,6 +8048,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: s_and_b32 s4, s4, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; 
GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8008,6 +8061,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8020,6 +8074,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0 ; GFX12-NEXT: s_and_b32 s0, s0, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -8866,6 +8921,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47 ; GFX12-NEXT: v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5 ; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15 ; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13 @@ -9605,14 +9661,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 ; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3 ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2 ; GFX12-NEXT: s_lshr_b32 s2, s2, 24 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1 ; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -9758,11 +9816,13 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 ; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4 ; GFX12-NEXT: s_ashr_i32 s2, s2, 24 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -9961,6 +10021,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_lshr_b32 s2, s3, 24 ; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3 ; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] @@ -10187,8 +10248,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_ashr_i32 s2, s2, 24 ; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000 ; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000 
+; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5 @@ -10537,6 +10600,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5 ; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6 ; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11 ; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12 ; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9 @@ -10926,6 +10990,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5 ; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000 ; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6 ; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000 @@ -10937,14 +11002,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5 ; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12 ; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6 ; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11 ; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10 ; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3 ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4 ; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 @@ -11566,6 +11633,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6 ; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10 ; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14 ; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12 ; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13 @@ -11580,6 +11648,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s16, s5, 24 ; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -11593,11 +11662,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s18, s2, 24 ; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9 ; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13 ; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17 ; GFX12-NEXT: s_lshr_b32 s22, s0, 24 ; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19 ; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48 @@ -12316,6 +12387,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0 ; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000 ; GFX12-NEXT: 
v_ashrrev_i16 v5, 8, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 ; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2 ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20 @@ -12323,6 +12395,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3 ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18 @@ -12333,9 +12406,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2 ; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10 ; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000 ; GFX12-NEXT: s_lshr_b32 s11, s7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000 ; GFX12-NEXT: s_lshr_b32 s10, s6, 16 @@ -12344,9 +12419,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8 ; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13 ; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000 ; GFX12-NEXT: s_lshr_b32 s13, s5, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000 ; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17 @@ -12355,6 +12432,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6 ; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11 ; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1 ; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000 ; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12 @@ -12362,11 +12440,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000 ; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3 ; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5 ; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14 ; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v11, s0 ; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15 ; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 0894d3251423d..295ae94902da7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -503,12 +503,14 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64: @@ -694,12 +696,14 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64__offset: @@ -884,12 +888,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64: @@ -1066,12 +1072,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1267,13 +1275,15 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16: @@ -1573,13 +1583,15 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset: @@ 
-1887,12 +1899,14 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16: @@ -2181,12 +2195,14 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2474,13 +2490,15 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -2710,12 +2728,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2958,13 +2978,15 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16: @@ -3312,13 +3334,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: 
s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3674,12 +3698,14 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16: @@ -4016,12 +4042,14 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4357,13 +4385,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -4648,12 +4678,14 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -7018,11 +7050,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: +; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7030,25 +7063,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_4: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -7061,22 +7096,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s5, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 ; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: @@ -7086,6 +7125,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, 
s1 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 @@ -7885,32 +7925,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: .LBB29_4: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo @@ -7923,28 +7967,33 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_ctz_i32_b32 s5, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 ; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: .LBB29_8: +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 6dec36c316ee3..cc79db1b20af4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -816,13 +816,15 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16: @@ -1129,13 +1131,15 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1450,12 +1454,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16: @@ -1752,12 +1758,14 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2053,13 +2061,15 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2297,12 +2307,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16: @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3272,12 +3288,14 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16: @@ -3616,12 +3634,14 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -4252,12 +4274,14 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16: @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5073,12 +5101,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
local_atomic_fmax_noret_v2f16: @@ -5334,12 +5364,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16: @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -6370,12 +6406,14 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6733,12 +6771,14 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index b3132a2fa80dd..1ffd93e35d8cd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -816,13 +816,15 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16: @@ -1129,13 +1131,15 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1450,12 +1454,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16: @@ -1752,12 +1758,14 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2053,13 +2061,15 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2297,12 +2307,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16: @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3272,12 +3288,14 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16: @@ -3616,12 +3634,14 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset: 
@@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -4252,12 +4274,14 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16: @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5073,12 +5101,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16: @@ -5334,12 +5364,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr 
addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16: @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -6370,12 +6406,14 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6733,12 +6771,14 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 5ebeddd04b2ae..9bc8bafc34a68 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -35,13 +35,15 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32: @@ -246,13 +248,15 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__offset: @@ -457,12 +461,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32: @@ -657,12 +663,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__offset: @@ -865,12 +873,14 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64: @@ -1081,12 +1091,14 @@ define double 
@local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1296,12 +1308,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64: @@ -1501,12 +1515,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1725,13 +1741,15 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16: @@ -2031,13 +2049,15 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2345,12 +2365,14 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16: @@ -2639,12 +2661,14 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2932,13 +2956,15 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3168,12 +3194,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3416,13 +3444,15 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16: @@ -3770,13 +3800,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4132,12 +4164,14 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16: @@ -4474,12 +4508,14 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4815,13 +4851,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -5106,12 +5144,14 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5381,13 +5421,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16: @@ -5635,13 +5677,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset: @@ -5888,12 +5932,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16: @@ -6130,12 +6176,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6398,13 +6446,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16: @@ -6774,13 +6824,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -7150,12 +7202,14 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7513,12 +7567,14 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7864,13 +7920,15 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: @@ -8074,12 +8132,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 390d1d70ff2aa..df954f6f940c8 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 ; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 @@ -149,6 +149,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: .LBB3_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_wait_alu 0xfffe ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: s_add_co_i32 s2, s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index fef1b57db5685..acba2841a7107 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -200,6 +200,7 @@ define amdgpu_kernel void @caller() { ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -212,6 +213,7 @@ define amdgpu_kernel void @caller() { ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() @@ -276,9 +278,10 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 ; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll index f90753652baa5..1da05ed264a64 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -107,6 +107,7 @@ define amdgpu_cs void @caller() { ; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -116,6 +117,7 @@ define amdgpu_cs void @caller() { ; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo ; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() @@ -182,9 +184,10 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 ; GFX12-NEXT: 
s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v8, s1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 94d1eca05ed0e..8a789a4c6cda9 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -106,17 +106,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s0, 0 ; GFX12-NEXT: s_bitset0_b32 s0, 0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc @@ -124,8 +127,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: @@ -313,10 +318,12 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 @@ -327,8 +334,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc: @@ -530,17 +539,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s33 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s0, 0 ; GFX12-NEXT: s_bitset0_b32 s0, 0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: 
;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc @@ -548,10 +560,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -745,6 +759,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 @@ -756,8 +771,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: @@ -911,6 +928,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 @@ -921,8 +939,10 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: @@ -1092,6 +1112,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 @@ -1103,10 +1124,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1292,6 +1315,7 @@ define void 
@scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s33 @@ -1303,10 +1327,12 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1492,9 +1518,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART @@ -1509,8 +1537,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: v_readlane_b32 s59, v2, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: @@ -1710,10 +1740,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo @@ -1728,8 +1760,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: v_readlane_b32 s59, v2, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 6346406fa8941..9829b7e787d47 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -673,6 +673,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 @@ -711,13 +712,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s32, 0 ; GFX12-NEXT: v_writelane_b32 v23, s59, 28 ; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bitcmp1_b32 s32, 0 ; GFX12-NEXT: s_bitset0_b32 s32, 0 ; GFX12-NEXT: ;;#ASMSTART @@ -754,8 +756,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: v_readlane_b32 s30, v23, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -1396,6 +1400,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v21, s30, 0 ; GFX12-NEXT: v_writelane_b32 v21, s31, 1 @@ -1466,8 +1471,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: v_readlane_b32 s30, v21, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 16, addrspace(5) @@ -2196,16 +2203,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: v_writelane_b32 v23, s31, 1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v1 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 ; GFX12-NEXT: v_writelane_b32 v23, s33, 2 ; GFX12-NEXT: v_writelane_b32 v23, s34, 3 @@ -2271,8 +2280,10 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: v_readlane_b32 s30, v23, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 45e8b3bcff13c..e9b4ec52599a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-WGP-LABEL: flat_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-LABEL: flat_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-WGP-LABEL: flat_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-LABEL: flat_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -547,6 +551,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-WGP-LABEL: flat_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -565,6 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-LABEL: flat_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -765,6 +771,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-LABEL: flat_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -787,6 +794,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-LABEL: flat_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1550,6 +1558,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1562,6 +1571,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1729,6 +1739,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1743,6 +1754,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1901,6 +1913,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1918,6 +1931,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2106,6 +2120,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2125,6 +2140,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2315,6 +2331,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2334,6 +2351,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3215,6 +3233,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3231,6 +3250,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3473,6 +3493,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3491,6 +3512,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3724,6 +3746,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3745,6 +3768,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4008,6 +4032,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4031,6 +4056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4296,6 +4322,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4319,6 +4346,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4568,6 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4586,6 +4615,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4830,6 +4860,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4848,6 +4879,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5108,6 +5140,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5131,6 +5164,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5396,6 +5430,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5419,6 +5454,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5684,6 +5720,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5707,6 +5744,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5972,6 +6010,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5995,6 +6034,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6260,6 +6300,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6283,6 +6324,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6548,6 +6590,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6571,6 +6614,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6836,6 +6880,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6859,6 +6904,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7124,6 +7170,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7147,6 +7194,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7401,6 +7449,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7421,6 +7470,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7687,6 +7737,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7710,6 +7761,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7982,6 +8034,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8007,6 +8060,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8294,6 +8348,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8322,6 +8377,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8612,6 +8668,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8640,6 +8697,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8914,6 +8972,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8937,6 +8996,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9206,6 +9266,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9229,6 +9290,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9514,6 +9576,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9542,6 +9605,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9832,6 +9896,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9860,6 +9925,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10150,6 +10216,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10178,6 +10245,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10468,6 +10536,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10496,6 +10565,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10786,6 +10856,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10814,6 +10885,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11104,6 +11176,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11132,6 +11205,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11422,6 +11496,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11450,6 +11525,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11740,6 +11816,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11768,6 +11845,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11953,6 +12031,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11968,6 +12047,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12138,6 +12218,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12153,6 +12234,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12344,6 +12426,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12363,6 +12446,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12572,6 +12656,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12595,6 +12680,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13359,6 +13445,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13371,6 +13458,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13534,6 +13622,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13548,6 +13637,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13706,6 +13796,7 @@ define amdgpu_kernel 
void @flat_agent_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13723,6 +13814,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13907,6 +13999,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13926,6 +14019,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14112,6 +14206,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14131,6 +14226,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15042,6 +15138,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15058,6 +15155,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15296,6 +15394,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15314,6 +15413,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15547,6 +15647,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15568,6 +15669,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15827,6 +15929,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15850,6 +15953,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16111,6 +16215,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16134,6 +16239,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16379,6 +16485,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16397,6 +16504,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16637,6 +16745,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16655,6 +16764,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16911,6 +17021,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16934,6 +17045,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17195,6 +17307,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17218,6 +17331,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17479,6 +17593,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17502,6 +17617,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17763,6 +17879,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17786,6 +17903,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18047,6 +18165,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18070,6 +18189,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18331,6 +18451,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18354,6 +18475,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18615,6 +18737,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18638,6 +18761,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18899,6 +19023,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18922,6 +19047,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19176,6 +19302,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19196,6 +19323,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19470,6 +19598,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19494,6 +19623,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19767,6 +19897,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19792,6 +19923,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20087,6 +20219,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20116,6 +20249,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20415,6 
+20549,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20444,6 +20579,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20727,6 +20863,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20751,6 +20888,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21029,6 +21167,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21053,6 +21192,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21347,6 +21487,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21376,6 +21517,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21675,6 +21817,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21704,6 +21847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22003,6 +22147,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22032,6 +22177,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22331,6 +22477,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22360,6 +22507,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22659,6 +22807,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22688,6 +22837,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22987,6 +23137,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23016,6 +23167,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; 
GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23315,6 +23467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23344,6 +23497,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23643,6 +23797,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23672,6 +23827,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index fb40274cac1ba..5c59481c59853 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -29,18 +30,23 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-NEXT: s_mov_b32 s2, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, s4 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_mov_b32 s2, s5 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 
@@ -64,6 +70,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -88,6 +95,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-LABEL: flat_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 5fa8e6891bafb..b2340caa2933f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -475,18 +477,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -504,18 +511,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -688,6 +700,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -703,6 +716,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1007,17 +1021,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -1036,17 +1055,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -1224,6 +1248,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1242,6 +1267,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-LABEL: flat_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe 
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 4c9ce15211e34..304c80d7bb24d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-WGP-LABEL: flat_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-LABEL: flat_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-LABEL: flat_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-WGP-LABEL: flat_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-LABEL: flat_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-WGP-LABEL: 
flat_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; 
%bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; 
GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5826,6 +5870,7 @@ define 
amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9393,7 @@ define 
amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13938,6 +14043,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16001,6 +16123,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19245,6 +19392,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20317,6 +20471,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20337,6 +20492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry 
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index e77f1432c1c9d..3502a29edeecb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-WGP-LABEL: flat_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-LABEL: flat_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-WGP-LABEL: flat_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-LABEL: flat_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +553,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-WGP-LABEL: flat_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -567,6 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-LABEL: flat_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -769,6 +775,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-LABEL: flat_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -791,6 +798,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-LABEL: flat_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1558,6 +1566,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; 
%entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1570,6 +1579,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1739,6 +1749,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1753,6 +1764,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1913,6 +1925,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-WGP-LABEL: flat_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1930,6 +1943,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-LABEL: flat_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2122,6 +2136,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2141,6 +2156,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2335,6 +2351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2354,6 +2371,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3245,6 +3263,7 @@ define 
amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3261,6 +3280,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3505,6 +3525,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3523,6 +3544,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3758,6 +3780,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3779,6 +3802,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4046,6 +4070,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4069,6 +4094,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4338,6 +4364,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4361,6 +4388,7 @@ define 
amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4612,6 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4630,6 +4659,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4876,6 +4906,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4894,6 +4925,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5158,6 +5190,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5181,6 +5214,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5450,6 +5484,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5473,6 +5508,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5742,6 +5778,7 @@ define amdgpu_kernel void 
@flat_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5765,6 +5802,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6034,6 +6072,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6096,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6326,6 +6366,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6349,6 +6390,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6618,6 +6660,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6641,6 +6684,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6910,6 +6954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6933,6 +6978,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; 
GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7202,6 +7248,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7225,6 +7272,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7479,6 +7527,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7499,6 +7548,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7767,6 +7817,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7790,6 +7841,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8064,6 +8116,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8089,6 +8142,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8380,6 +8434,7 @@ define amdgpu_kernel void 
@flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8408,6 +8463,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8702,6 +8758,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8730,6 +8787,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9006,6 +9064,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9029,6 +9088,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9300,6 +9360,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9384,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9612,6 +9674,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ 
-9640,6 +9703,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9934,6 +9998,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9962,6 +10027,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10256,6 +10322,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10284,6 +10351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10578,6 +10646,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10606,6 +10675,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10900,6 +10970,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10928,6 +10999,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11222,6 +11294,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11250,6 +11323,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11544,6 +11618,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11572,6 +11647,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11866,6 +11942,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11894,6 +11971,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -12079,6 +12157,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12094,6 +12173,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-LABEL: flat_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12264,6 +12344,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12279,6 +12360,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12472,6 +12554,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12491,6 +12574,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-LABEL: flat_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12702,6 +12786,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12725,6 +12810,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13493,6 +13579,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13505,6 +13592,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13670,6 +13758,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13684,6 +13773,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13844,6 +13934,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; 
GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -13861,6 +13952,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14049,6 +14141,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14068,6 +14161,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -14258,6 +14352,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -14277,6 +14372,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -15198,6 +15294,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15214,6 +15311,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15454,6 +15552,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15472,6 +15571,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; 
GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15707,6 +15807,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15728,6 +15829,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15991,6 +16093,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16014,6 +16117,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16279,6 +16383,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16302,6 +16407,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16549,6 +16655,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16567,6 +16674,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc @@ -16809,6 +16917,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16827,6 +16936,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17087,6 +17197,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17110,6 +17221,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17375,6 +17487,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17398,6 +17511,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17663,6 +17777,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17686,6 +17801,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17951,6 +18067,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17974,6 +18091,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18239,6 +18357,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18262,6 +18381,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18527,6 +18647,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18550,6 +18671,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18815,6 +18937,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18838,6 +18961,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19103,6 +19227,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19126,6 +19251,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19380,6 +19506,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19400,6 +19527,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19676,6 +19804,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19700,6 +19829,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19975,6 +20105,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20000,6 +20131,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20299,6 +20431,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20328,6 +20461,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ 
-20631,6 +20765,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20660,6 +20795,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20945,6 +21081,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20969,6 +21106,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21249,6 +21387,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21273,6 +21412,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21571,6 +21711,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21600,6 +21741,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21903,6 +22045,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] 
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21932,6 +22075,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22235,6 +22379,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22264,6 +22409,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22567,6 +22713,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22596,6 +22743,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22899,6 +23047,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22928,6 +23077,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23231,6 +23381,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23260,6 +23411,7 @@ define amdgpu_kernel void 
@flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23563,6 +23715,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23592,6 +23745,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23895,6 +24049,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23924,6 +24079,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 6bf54ccabc9da..c2b7aa4fcfbf1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -110,6 +110,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -128,6 +129,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -329,18 +331,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -361,18 +368,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 @@ -498,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -518,6 +531,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -727,17 +741,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -761,17 +780,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX12-CU-NEXT: s_mov_b32 s0, 2 +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 @@ -896,6 +920,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -914,6 +939,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index c7826181cc8dd..23982f8a00cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-WGP-LABEL: flat_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-LABEL: flat_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-LABEL: flat_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-WGP-LABEL: flat_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-LABEL: flat_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], 
s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; 
%bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7705,6 +7764,7 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc @@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12233,6 
+12328,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; 
GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19225,6 +19371,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 8949e4b782f63..cd2c8176b8d33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-WGP-LABEL: flat_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-LABEL: flat_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-LABEL: flat_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -544,6 +548,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-WGP-LABEL: flat_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -562,6 +567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-LABEL: flat_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -755,6 +761,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -777,6 +784,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1519,6 +1527,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1531,6 +1540,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1687,6 +1697,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1701,6 +1712,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -1854,6 +1866,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -1871,6 +1884,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2040,6 +2054,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2059,6 +2074,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -2229,6 +2245,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -2248,6 +2265,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -3093,6 +3111,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3109,6 +3128,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3340,6 +3360,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3358,6 +3379,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3586,6 +3608,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3607,6 +3630,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3851,6 +3875,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3874,6 +3899,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4119,6 +4145,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4142,6 +4169,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4375,6 +4403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4393,6 +4422,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4625,6 +4655,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4643,6 +4674,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4887,6 +4919,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4910,6 +4943,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5155,6 +5189,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5178,6 +5213,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5423,6 +5459,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5446,6 +5483,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5691,6 +5729,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5714,6 +5753,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5963,6 +6003,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5983,6 +6024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6246,6 +6288,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6269,6 +6312,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6535,6 +6579,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6560,6 +6605,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6836,6 +6882,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6864,6 +6911,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7141,6 +7189,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7169,6 +7218,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7434,6 +7484,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7457,6 +7508,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7721,6 +7773,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7744,6 +7797,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8020,6 +8074,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: 
flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8048,6 +8103,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8325,6 +8381,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8353,6 +8410,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8630,6 +8688,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8658,6 +8717,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8935,6 +8995,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8963,6 +9024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9240,6 +9302,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9268,6 +9331,7 @@ define 
amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9545,6 +9609,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9573,6 +9638,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9850,6 +9916,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9878,6 +9945,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10155,6 +10223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10183,6 +10252,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10362,6 +10432,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10377,6 +10448,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: 
s_wait_kmcnt 0x0 @@ -10547,6 +10619,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10562,6 +10635,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10738,6 +10812,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10757,6 +10832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -10939,6 +11015,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -10962,6 +11039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11688,6 +11766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11700,6 +11779,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -11848,6 +11928,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -11862,6 +11943,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12008,6 +12090,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12025,6 +12108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12179,6 +12263,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12198,6 +12283,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -12352,6 +12438,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -12371,6 +12458,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -13188,6 +13276,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13204,6 +13293,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13427,6 +13517,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13445,6 +13536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13666,6 +13758,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13687,6 +13780,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13916,6 +14010,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13939,6 +14034,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14168,6 +14264,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14191,6 +14288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14414,6 +14512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14432,6 +14531,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14655,6 +14755,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14673,6 +14774,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14902,6 +15004,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14925,6 +15028,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15154,6 +15258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15177,6 +15282,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15406,6 +15512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15429,6 +15536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: 
flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15658,6 +15766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15681,6 +15790,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15910,6 +16020,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15933,6 +16044,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16162,6 +16274,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16185,6 +16298,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16414,6 +16528,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16437,6 +16552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16666,6 +16782,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16689,6 +16806,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16936,6 +17054,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16956,6 +17075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17215,6 +17335,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17239,6 +17360,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17498,6 +17620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17523,6 +17646,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17788,6 +17912,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: 
flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17817,6 +17942,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18082,6 +18208,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18111,6 +18238,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18370,6 +18498,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18394,6 +18523,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18653,6 +18783,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18677,6 +18808,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18942,6 +19074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18971,6 +19104,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19236,6 +19370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19265,6 +19400,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19530,6 +19666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19559,6 +19696,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19824,6 +19962,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19853,6 +19992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20118,6 +20258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20147,6 +20288,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20412,6 +20554,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20441,6 +20584,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20706,6 +20850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20735,6 +20880,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21000,6 +21146,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21029,6 +21176,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index b56860991b194..4ba64af63e5f5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-WGP-LABEL: global_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ 
-194,6 +195,7 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-LABEL: global_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-WGP-LABEL: global_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-LABEL: global_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -588,6 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-WGP-LABEL: global_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -603,6 +608,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-LABEL: global_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -811,6 +817,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-LABEL: global_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -830,6 +837,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-LABEL: global_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -995,6 +1003,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-WGP-LABEL: global_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1006,6 +1015,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-LABEL: global_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1162,6 +1172,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-WGP-LABEL: global_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 
s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1173,6 +1184,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-LABEL: global_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1346,6 +1358,7 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-WGP-LABEL: global_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1362,6 +1375,7 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-LABEL: global_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1540,6 +1554,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-WGP-LABEL: global_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1556,6 +1571,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-LABEL: global_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3395,6 +3411,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3410,6 +3427,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3644,6 +3662,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3661,6 +3680,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3889,6 +3909,7 @@ define amdgpu_kernel void 
@global_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3909,6 +3930,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4165,6 +4187,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4187,6 +4210,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4445,6 +4469,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4467,6 +4492,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4708,6 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4725,6 +4752,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4961,6 +4989,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4978,6 +5007,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: 
global_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5231,6 +5261,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5253,6 +5284,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5511,6 +5543,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5533,6 +5566,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5791,6 +5825,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5813,6 +5848,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6071,6 +6107,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6093,6 +6130,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6351,6 +6389,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6373,6 +6412,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6631,6 +6671,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6653,6 +6694,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6911,6 +6953,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6933,6 +6976,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7191,6 +7235,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7213,6 +7258,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7454,6 +7500,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7471,6 +7518,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7724,6 +7772,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7744,6 +7793,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8002,6 +8052,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8024,6 +8075,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8299,6 +8351,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8324,6 +8377,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8602,6 +8656,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8627,6 +8682,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8888,6 +8944,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8908,6 +8965,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9164,6 +9222,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9184,6 +9243,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9457,6 +9517,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9482,6 +9543,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9760,6 +9822,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9785,6 +9848,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10063,6 +10127,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10088,6 +10153,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10366,6 +10432,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10391,6 +10458,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10669,6 +10737,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10694,6 +10763,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10972,6 +11042,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10997,6 +11068,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11275,6 +11347,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11300,6 +11373,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11578,6 +11652,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11603,6 +11678,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11802,6 +11878,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11814,6 +11891,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-LABEL: global_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11998,6 +12076,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12010,6 +12089,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12208,6 +12288,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12223,6 +12304,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-LABEL: global_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12431,6 +12513,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -12450,6 +12533,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 @@ -12615,6 +12699,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12626,6 +12711,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-LABEL: global_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12782,6 +12868,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12793,6 +12880,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12966,6 +13054,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-WGP-LABEL: global_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12982,6 +13071,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-LABEL: global_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13160,6 +13250,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -13176,6 +13267,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -15015,6 +15107,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15030,6 +15123,7 @@ define amdgpu_kernel void 
@global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15264,6 +15358,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15281,6 +15376,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15509,6 +15605,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15529,6 +15626,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15785,6 +15883,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15807,6 +15906,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16065,6 +16165,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16087,6 +16188,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 @@ -16328,6 +16430,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16345,6 +16448,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16581,6 +16685,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16598,6 +16703,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16851,6 +16957,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16873,6 +16980,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17131,6 +17239,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17153,6 +17262,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17521,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17433,6 +17544,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17691,6 +17803,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17713,6 +17826,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17971,6 +18085,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17993,6 +18108,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18251,6 +18367,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18273,6 +18390,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18531,6 +18649,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18553,6 +18672,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18811,6 +18931,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18833,6 +18954,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19074,6 +19196,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19091,6 +19214,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19344,6 +19468,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19364,6 +19489,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19637,6 +19763,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19662,6 +19789,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19940,6 +20068,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: 
global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19965,6 +20094,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20226,6 +20356,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20246,6 +20377,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20502,6 +20634,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20522,6 +20655,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20795,6 +20929,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20820,6 +20955,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21098,6 +21234,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -21123,6 +21260,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21401,6 +21539,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21426,6 +21565,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21704,6 +21844,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21729,6 +21870,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22007,6 +22149,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22032,6 +22175,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22310,6 +22454,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22335,6 +22480,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe 
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22613,6 +22759,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22638,6 +22785,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22916,6 +23064,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22941,6 +23090,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 7a9cb992a0cd1..0fc3212b0f46d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-LABEL: global_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -25,13 +26,16 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-LABEL: global_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU @@ -50,6 +54,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-LABEL: global_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -70,13 +75,16 @@ define amdgpu_kernel void 
@global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-LABEL: global_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 9b2b3a4cfa9ba..14f1734235673 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -178,6 +178,7 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-WGP-LABEL: global_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -191,6 +192,7 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-LABEL: global_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -437,13 +439,16 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-WGP-LABEL: global_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT @@ -454,13 +459,16 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-LABEL: global_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT @@ -641,6 +649,7 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-WGP-LABEL: global_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 
0 @@ -654,6 +663,7 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-LABEL: global_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -881,13 +891,16 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-WGP-LABEL: global_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -897,13 +910,16 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-LABEL: global_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1087,6 +1103,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-WGP-LABEL: global_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -1101,6 +1118,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-LABEL: global_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index afc46fbc23a67..33aaeebf658dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-WGP-LABEL: global_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-LABEL: global_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-WGP-LABEL: global_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-LABEL: global_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -574,6 +578,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-WGP-LABEL: global_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -586,6 +591,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-LABEL: global_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -770,6 +776,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -782,6 +789,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-LABEL: global_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -940,6 +948,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-WGP-LABEL: global_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -951,6 +960,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-LABEL: global_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-WGP-LABEL: global_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1118,6 +1129,7 @@ define amdgpu_kernel 
void @global_singlethread_monotonic_store( ; GFX12-CU-LABEL: global_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-WGP-LABEL: global_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-LABEL: global_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-LABEL: global_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( 
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4363,6 +4392,7 @@ define 
amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: 
global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9721,6 
+9794,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-LABEL: global_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: 
; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16646,6 +16771,7 @@ 
define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 @@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-WGP-LABEL: 
global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 62a4f3b43b2dc..2c877755019ce 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-WGP-LABEL: global_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-LABEL: global_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-WGP-LABEL: global_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-LABEL: global_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -590,6 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-WGP-LABEL: global_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -605,6 +610,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-LABEL: global_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -815,6 +821,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-LABEL: global_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -834,6 +841,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-LABEL: global_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -999,6 +1007,7 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-WGP-LABEL: global_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1010,6 +1019,7 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-LABEL: global_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1166,6 +1176,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-WGP-LABEL: global_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1177,6 +1188,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-LABEL: global_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1352,6 +1364,7 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-WGP-LABEL: global_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1368,6 +1381,7 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-LABEL: global_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1548,6 +1562,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-WGP-LABEL: global_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1564,6 +1579,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-LABEL: global_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3425,6 +3441,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3440,6 +3457,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3694,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3693,6 +3712,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3923,6 +3943,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3943,6 +3964,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4203,6 +4225,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4225,6 +4248,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4487,6 +4511,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4509,6 +4534,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4752,6 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4769,6 +4796,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5007,6 +5035,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5024,6 +5053,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5281,6 +5311,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5303,6 +5334,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5565,6 +5597,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5587,6 +5620,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5849,6 +5883,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5871,6 +5906,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6133,6 +6169,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6192,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6396,6 +6434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6413,6 +6452,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6668,6 +6708,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6688,6 +6729,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6965,6 +7007,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6990,6 +7033,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7272,6 +7316,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7297,6 +7342,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7560,6 +7606,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7580,6 +7627,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7838,6 +7886,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7858,6 +7907,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8135,6 +8185,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8160,6 +8211,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8442,6 +8494,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8467,6 +8520,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8749,6 +8803,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8774,6 +8829,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9056,6 +9112,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9081,6 +9138,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9363,6 +9421,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9388,6 +9447,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9670,6 +9730,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9695,6 +9756,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9977,6 +10039,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10002,6 +10065,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10284,6 +10348,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10309,6 +10374,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10508,6 +10574,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-WGP-LABEL: global_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10520,6 +10587,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-LABEL: global_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10704,6 +10772,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10716,6 +10785,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-LABEL: global_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10916,6 +10986,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-WGP-LABEL: global_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 
0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10931,6 +11002,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-LABEL: global_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11141,6 +11213,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11160,6 +11233,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11325,6 +11399,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-WGP-LABEL: global_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11336,6 +11411,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-LABEL: global_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11492,6 +11568,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11503,6 +11580,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11678,6 +11756,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-WGP-LABEL: global_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11694,6 +11773,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-LABEL: global_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11874,6 +11954,7 @@ define amdgpu_kernel void 
@global_system_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11890,6 +11971,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13751,6 +13833,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13766,6 +13849,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14002,6 +14086,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14019,6 +14104,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14249,6 +14335,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14269,6 +14356,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14529,6 +14617,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -14551,6 +14640,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14835,6 +14926,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15078,6 +15170,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15095,6 +15188,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15333,6 +15427,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15350,6 +15445,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15607,6 +15703,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15629,6 +15726,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15891,6 +15989,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15913,6 +16012,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16175,6 +16275,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16197,6 +16298,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16459,6 +16561,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16481,6 +16584,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16743,6 +16847,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16765,6 +16870,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17027,6 +17133,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17049,6 +17156,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17311,6 +17419,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17333,6 +17442,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17595,6 +17705,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17617,6 +17728,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17858,6 +17970,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17875,6 +17988,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18130,6 +18244,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18150,6 +18265,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18410,6 +18526,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18432,6 +18549,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18711,6 +18829,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18736,6 +18855,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19018,6 +19138,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19043,6 +19164,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19306,6 +19428,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19326,6 +19449,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19584,6 +19708,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19604,6 +19729,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19881,6 +20007,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19906,6 +20033,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20188,6 +20316,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20213,6 +20342,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20495,6 +20625,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20520,6 +20651,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20802,6 +20934,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20827,6 +20960,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21109,6 +21243,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21134,6 +21269,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21416,6 +21552,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21441,6 +21578,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21723,6 +21861,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21748,6 +21887,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22030,6 +22170,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22055,6 +22196,7 @@ define amdgpu_kernel void 
@global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index a98efb49b4b72..692aee5f4b9ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -126,6 +126,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-WGP-LABEL: global_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -140,6 +141,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-LABEL: global_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -315,13 +317,16 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-WGP-LABEL: global_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS @@ -334,13 +339,16 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-LABEL: global_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS @@ -474,6 +482,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-WGP-LABEL: global_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -493,6 +502,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-LABEL: global_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 @@ -665,13 +675,16 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-LABEL: global_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -687,13 +700,16 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-LABEL: global_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -833,6 +849,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -848,6 +865,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -967,6 +985,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -983,6 +1002,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index f805e2cf37006..aaa11c0455606 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-WGP-LABEL: global_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-LABEL: global_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-WGP-LABEL: global_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-LABEL: global_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -574,6 +578,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-WGP-LABEL: global_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -586,6 +591,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-LABEL: global_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -770,6 +776,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -782,6 +789,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-LABEL: global_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -940,6 +948,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-WGP-LABEL: global_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -951,6 +960,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-LABEL: global_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,6 +1117,7 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_store( ; GFX12-WGP-LABEL: global_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-LABEL: global_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-WGP-LABEL: global_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-LABEL: global_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-LABEL: global_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: 
; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: 
global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8684,6 +8748,7 @@ define amdgpu_kernel void 
@global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 
0x8 @@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-LABEL: global_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11437,6 +11528,7 @@ define 
amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 
v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15692,6 +15809,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17904,6 +18038,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19961,6 +20112,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 30bf492071535..25c75aa50df09 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-WGP-LABEL: global_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -194,6 +195,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-LABEL: global_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -378,6 +380,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-WGP-LABEL: global_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -390,6 +393,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-LABEL: global_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -578,6 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-WGP-LABEL: global_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -593,6 +598,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-LABEL: global_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -786,6 +792,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -805,6 +812,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-LABEL: global_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -964,6 +972,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-WGP-LABEL: global_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -975,6 +984,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-LABEL: global_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1131,6 +1141,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-WGP-LABEL: global_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1142,6 +1153,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-LABEL: global_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1311,6 +1323,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-WGP-LABEL: global_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1327,6 +1340,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-LABEL: global_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -1497,6 +1511,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1513,6 +1528,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -3212,6 +3228,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3227,6 +3244,7 @@ define amdgpu_kernel void 
@global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3444,6 +3462,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3461,6 +3480,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3683,6 +3703,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3703,6 +3724,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3934,6 +3956,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3956,6 +3979,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4187,6 +4211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4209,6 +4234,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4427,6 +4453,7 @@ define amdgpu_kernel 
void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4444,6 +4471,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4661,6 +4689,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4678,6 +4707,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4908,6 +4938,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4930,6 +4961,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5161,6 +5193,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5183,6 +5216,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5414,6 +5448,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5436,6 +5471,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5667,6 +5703,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5689,6 +5726,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5920,6 +5958,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5942,6 +5981,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6173,6 +6213,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6195,6 +6236,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6426,6 +6468,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6448,6 +6491,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6679,6 +6723,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6701,6 +6746,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6936,6 +6982,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6953,6 +7000,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7195,6 +7243,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7215,6 +7264,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7466,6 +7516,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7488,6 +7539,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7744,6 +7796,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 
s3, s[4:5], 0x8 @@ -7769,6 +7822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8025,6 +8079,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8050,6 +8105,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8293,6 +8349,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8313,6 +8370,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8555,6 +8613,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8575,6 +8634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8830,6 +8890,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8855,6 +8916,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9111,6 +9173,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9136,6 +9199,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9392,6 +9456,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9417,6 +9482,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9673,6 +9739,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9698,6 +9765,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9954,6 +10022,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9979,6 +10048,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10235,6 +10305,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe 
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10260,6 +10331,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10516,6 +10588,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10541,6 +10614,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10797,6 +10871,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10822,6 +10897,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11014,6 +11090,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11026,6 +11103,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11210,6 +11288,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11222,6 +11301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11410,6 +11490,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11425,6 +11506,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11615,6 +11697,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11634,6 +11717,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 @@ -11792,6 +11876,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11803,6 +11888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -11959,6 +12045,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -11970,6 +12057,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12132,6 +12220,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; 
GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12148,6 +12237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -12310,6 +12400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -12326,6 +12417,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -13984,6 +14076,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13999,6 +14092,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14216,6 +14310,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14233,6 +14328,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14448,6 +14544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14468,6 +14565,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14691,6 +14789,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14713,6 +14812,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14936,6 +15036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14958,6 +15059,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15175,6 +15277,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15192,6 +15295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15409,6 +15513,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15426,6 +15531,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15649,6 +15755,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15671,6 +15778,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15894,6 +16002,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15916,6 +16025,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16139,6 +16249,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16161,6 +16272,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16384,6 +16496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16406,6 +16519,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16629,6 +16743,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16651,6 +16766,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16874,6 +16990,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16896,6 +17013,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17119,6 +17237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17141,6 +17260,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17364,6 +17484,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17386,6 +17507,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17620,6 +17742,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17637,6 +17760,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; 
%entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17879,6 +18003,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17899,6 +18024,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18143,6 +18269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18165,6 +18292,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18413,6 +18541,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18438,6 +18567,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18686,6 +18816,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18711,6 +18842,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 @@ -18953,6 +19085,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18973,6 +19106,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19215,6 +19349,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19235,6 +19370,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19483,6 +19619,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19508,6 +19645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19756,6 +19894,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19781,6 +19920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20029,6 +20169,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20054,6 +20195,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20302,6 +20444,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20327,6 +20470,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20575,6 +20719,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20600,6 +20745,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20848,6 +20994,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20873,6 +21020,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21121,6 +21269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21146,6 
+21295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21394,6 +21544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21419,6 +21570,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll index 67ca31a2bb84e..1a2058cbe39e4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 02cd97c9fe82a..d925ca52f8560 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ 
-6047,6 +6078,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 
0x8 @@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 
+9450,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; 
GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel void 
@local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 
s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void 
@local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index ba9711333a194..fce60ff12aed3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -183,6 +183,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-WGP-LABEL: local_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -196,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-LABEL: local_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -420,13 +422,16 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 @@ -437,13 +442,16 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 @@ -615,6 +623,7 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-WGP-LABEL: local_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -628,6 +637,7 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-LABEL: local_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -826,8 +836,10 @@ define amdgpu_kernel 
void @local_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -841,8 +853,10 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1027,6 +1041,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-WGP-LABEL: local_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1040,6 +1055,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-LABEL: local_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index fe5f2c51734f7..033c71574643c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 
s0, s[4:5], 0x8 @@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6290,6 +6329,7 @@ define amdgpu_kernel void 
@local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 
s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11538,6 +11603,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13145,6 +13227,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 
0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14798,6 +14897,7 
@@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; 
%bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 
0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 1c4c8d41b18f9..548c5aceb25f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 
s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void 
@local_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; 
GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void 
@local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8189,6 +8238,7 @@ define 
amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 +9450,7 @@ define 
amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; 
GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel 
void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; 
GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: 
local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index a52dd9b340169..a8f7051bd5050 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -123,6 +123,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-WGP-LABEL: local_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -136,6 +137,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-LABEL: local_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -284,13 +286,16 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 @@ -301,13 +306,16 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 @@ -423,6 +431,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-LABEL: local_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -441,6 +450,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-LABEL: local_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 
0x0 @@ -576,8 +586,10 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -596,8 +608,10 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 02e4e0d69dc20..694ffb2964f56 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, 
s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: 
local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8500,6 +8559,7 @@ define amdgpu_kernel void 
@local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 
s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: 
; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13338,6 +13423,7 @@ define 
amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15224,6 +15326,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17008,6 +17127,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index c242963228537..0cf644c006fac 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( 
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; 
GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; 
GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6491,6 +6525,7 @@ define amdgpu_kernel void 
@local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; 
GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12813,6 +12882,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; 
%entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: 
; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 
0x8 @@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 61cec731feb56..8e292fa592975 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -6,6 +6,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add ; GFX12-LABEL: private_last_use_load_0: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -24,13 +25,16 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-LABEL: private_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-NEXT: s_mov_b32 s3, 2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU @@ -49,6 +53,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) % ; GFX12-LABEL: private_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -69,6 +74,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5 ; GFX12-LABEL: private_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 4e08065e879fd..c3599c87985be 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -193,6 +193,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-WGP-LABEL: private_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -205,6 +206,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-CU-LABEL: private_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -442,13 +444,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-WGP-LABEL: private_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT @@ -459,13 +464,16 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-LABEL: private_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT @@ -648,6 +656,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-WGP-LABEL: private_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -660,6 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-CU-LABEL: private_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -868,13 +878,16 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-LABEL: private_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -884,13 +897,16 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-LABEL: private_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -1085,6 +1101,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX12-WGP-LABEL: private_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -1099,6 +1116,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX12-CU-LABEL: private_nontemporal_volatile_load: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index a68b5f36b806e..9146f175eefcd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -135,6 +135,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-WGP-LABEL: private_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 @@ -149,6 +150,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-CU-LABEL: private_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 @@ -312,13 +314,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-LABEL: private_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS @@ -331,13 +336,16 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-LABEL: private_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS @@ -475,6 +483,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-WGP-LABEL: private_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -493,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-CU-LABEL: private_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -646,13 +656,16 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-LABEL: private_volatile_store_1: ; GFX12-WGP: 
; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -668,13 +681,16 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-LABEL: private_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index bbbbc0dc0f28d..a642543c3780d 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -11,8 +11,10 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) { ; GFX12-NEXT: s_add_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0 ; GFX12-NEXT: v_s_exp_f32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = call float @llvm.exp2.f32(float %src) @@ -61,8 +63,10 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 ; GFX12-NEXT: v_s_log_f32 s0, s0 -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = call float @llvm.log2.f32(float %src) @@ -166,6 +170,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0 ; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_mov_b32 s4, s1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1 @@ -179,16 +184,19 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000 ; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2 -; GFX12-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0 ; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3 ; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000 ; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000 ; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_cselect_b32 s0, s1, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -200,6 +208,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0 ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_mov_b32 s4, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -217,11 +226,11 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %src) @@ -270,10 +279,12 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 ; GFX12-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_s_log_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog %abs = call float @llvm.fabs.f32(float %src) @@ -291,8 +302,10 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0 ; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; @@ -304,10 +317,12 @@ define amdgpu_cs float 
@srcmods_neg_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 ; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %neg = fneg float %src diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index 4ea77d1d1ac15..b7aecca45def5 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -63,10 +63,11 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v2, s30, 0 ; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND @@ -74,16 +75,19 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_readlane_b32 s31, v2, 1 ; GFX12-NEXT: v_readlane_b32 s30, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index 1f36f7a0d9616..d4f7bf656d3b5 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -62,12 +62,14 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: swap: @@ -80,17 +82,19 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 ; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index 65feaf23ae2cb..ab222f4feeef0 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -45,18 +45,12 @@ name: mask_hazard_getpc1 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_getpc1 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_getpc1 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_getpc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $sgpr0_sgpr1 = S_GETPC_B64 $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -67,24 +61,15 @@ body: | name: mask_hazard_getpc2 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_getpc2 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc - ; GFX11-NEXT: } - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_getpc2 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc - ; GFX12-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc - ; GFX12-NEXT: } - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_getpc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, 
implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -523,18 +508,12 @@ body: | name: mask_hazard_subreg4 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_subreg4 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_subreg4 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0 - ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_subreg4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_lo = S_MOV_B32 0 $sgpr2 = S_MOV_B32 $vcc_lo @@ -546,18 +525,12 @@ body: | name: mask_hazard_subreg5 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_subreg5 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_subreg5 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec - ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0 - ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_subreg5 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_hi = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_hi = S_MOV_B32 0 $sgpr2 = S_MOV_B32 $vcc_hi @@ -569,20 +542,13 @@ body: | name: mask_hazard_waitcnt body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_waitcnt - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: S_WAITCNT 0 - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_waitcnt - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: S_WAITCNT 0 - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_waitcnt + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec S_WAITCNT 0 $sgpr0_sgpr1 = S_GETPC_B64 @@ -595,22 +561,14 @@ body: | name: mask_hazard_gap1 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap1 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, 
$sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap1 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - ; GFX12-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -624,20 +582,13 @@ body: | name: mask_hazard_gap2 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap2 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap2 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode $sgpr0_sgpr1 = S_GETPC_B64 @@ -650,20 +601,13 @@ body: | name: mask_hazard_gap3 body: | bb.0: - ; GFX11-LABEL: name: mask_hazard_gap3 - ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 - ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 - ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: mask_hazard_gap3 - ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec - ; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 - ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GFX12-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: mask_hazard_gap3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; 
GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 $sgpr0_sgpr1 = S_GETPC_B64 diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir new file mode 100644 index 0000000000000..2aa16dd904766 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir @@ -0,0 +1,862 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s +# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s + +--- | + @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ] + + define amdgpu_gs void @hazard_getpc1() { ret void } + define amdgpu_gs void @hazard_getpc2() { ret void } + define amdgpu_gs void @hazard_getpc3() { ret void } + define amdgpu_gs void @hazard_getpc4() { ret void } + define amdgpu_gs void @hazard_vcc1() { ret void } + define amdgpu_gs void @hazard_vcc2() { ret void } + define amdgpu_gs void @hazard_vcc3() { ret void } + define amdgpu_gs void @hazard_addc1() { ret void } + define amdgpu_gs void @hazard_addc2() { ret void } + define amdgpu_gs void @hazard_addc3() { ret void } + define amdgpu_gs void @hazard_addc4() { ret void } + define amdgpu_gs void @hazard_addc5() { ret void } + define amdgpu_gs void @hazard_addc6() { ret void } + define amdgpu_gs void @hazard_vaddc1() { ret void } + define amdgpu_gs void @hazard_gap1() { ret void } + define amdgpu_gs void @hazard_gap2() { ret void } + define amdgpu_gs void @hazard_gap3() { ret void } + define amdgpu_gs void @hazard_gap4_no_hazard() { ret void } + define amdgpu_gs void @hazard_valu_write1_no_hazard() { ret void } + define amdgpu_gs void @hazard_post_order1() { ret void } + define amdgpu_gs void @hazard_post_order2() { ret void } + define amdgpu_gs void @hazard_post_order_cycle() { ret void } + define amdgpu_cs void @hazard_calls() { ret void } +... + +--- +name: hazard_getpc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc1 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc1 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_getpc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc2 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc2 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_getpc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc3 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc + ; GCN-O0-NEXT: } + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc3 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-O2-NEXT: } + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + BUNDLE implicit-def $sgpr0_sgpr1 { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc + $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 12, implicit-def $scc, implicit $scc + } + S_ENDPGM 0 +... 
+ +--- +name: hazard_getpc4 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_getpc4 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc + ; GCN-O0-NEXT: } + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_getpc4 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc + ; GCN-O2-NEXT: } + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + BUNDLE implicit-def $sgpr0_sgpr1 { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr1 = S_SEXT_I32_I16 $sgpr1 + $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + } + S_ENDPGM 0 +... + +--- +name: hazard_vcc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc1 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc1 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_vcc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc2 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc2 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_vcc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vcc3 + ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vcc3 + ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_addc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_addc2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc2 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc2 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc3 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc3 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc4 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc4 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc4 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_addc5 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc5 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc5 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr16 = S_MOV_B32 0 + $sgpr32 = S_MOV_B32 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_addc6 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_addc6 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_addc6 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr16 = S_MOV_B32 0 + $sgpr32 = S_MOV_B32 0 + $sgpr48 = S_MOV_B32 0 + $sgpr80 = S_MOV_B32 0 + $sgpr96 = S_MOV_B32 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_vaddc1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_vaddc1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_vaddc1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_gap1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap1 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap1 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_gap2 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap2 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap2 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_gap3 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap3 + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap3 + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def 
$scc + $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_gap4_no_hazard +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_gap4_no_hazard + ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_gap4_no_hazard + ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_valu_write1_no_hazard +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard + ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard + ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-O2-NEXT: S_ENDPGM 0 + $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_post_order1 +body: | + bb.0: + ; GCN-O0-LABEL: name: hazard_post_order1 + ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order1 + ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_post_order2 +body: | + ; GCN-O0-LABEL: name: hazard_post_order2 + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: successors: %bb.1(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_BRANCH %bb.1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order2 + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: successors: %bb.1(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: S_BRANCH %bb.1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_post_order_cycle +body: | + ; GCN-O0-LABEL: name: hazard_post_order_cycle + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: successors: %bb.1(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: S_NOP 0 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: successors: %bb.2(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.2: + ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.3: + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_post_order_cycle + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: successors: %bb.1(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: S_NOP 0 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: successors: %bb.2(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.2: + ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.3: + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + S_NOP 0 + + bb.1: + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + + bb.2: + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + S_CBRANCH_SCC0 %bb.1, implicit $scc + + bb.3: + S_ENDPGM 0 +... 
+ +--- +name: hazard_calls +frameInfo: + hasCalls: true +body: | + ; GCN-O0-LABEL: name: hazard_calls + ; GCN-O0: bb.0: + ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.1: + ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.2: + ; GCN-O0-NEXT: successors: %bb.3(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.3: + ; GCN-O0-NEXT: successors: %bb.4(0x80000000) + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: {{ $}} + ; GCN-O0-NEXT: bb.4: + ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O0-NEXT: S_ENDPGM 0 + ; + ; GCN-O2-LABEL: name: hazard_calls + ; GCN-O2: bb.0: + ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.1: + ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.2: + ; GCN-O2-NEXT: successors: %bb.3(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.3: + ; GCN-O2-NEXT: successors: %bb.4(0x80000000) + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-O2-NEXT: {{ $}} + ; GCN-O2-NEXT: bb.4: + ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-O2-NEXT: S_ENDPGM 0 + bb.0: + $sgpr16 = S_MOV_B32 0 + S_SETPC_B64 $sgpr0_sgpr1 + + bb.1: + $sgpr18 = S_MOV_B32 0 + S_SETPC_B64_return $sgpr0_sgpr1 + + bb.2: + $sgpr20 = S_MOV_B32 0 + $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + + bb.3: + $sgpr8_sgpr9 = S_CALL_B64 0 + + bb.4: + $sgpr22 = S_MOV_B32 $sgpr8 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir index 3f40a57ca1491..e3b96c08348fc 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -1,11 +1,12 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s # GCN-LABEL: name: hazard_vcmpx_permlane16 # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16 @@ -128,6 +129,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_undef_src @@ -150,6 +152,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e64 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_e64_permlane16 From a5ce66423bfff6f2185e5fe48bc6ffc0ade7df4d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 20:13:40 -0700 Subject: [PATCH 025/425] [RISCV] Remove RISCVISD::FP_ROUND_BF16. Use isel patterns on regular FP_ROUND. For double->bf16 we need to emit two instructions. Note the double->bf16 conversion does double rounding, but I don't know a good way to fix that. 
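To make the double-rounding caveat above concrete, here is a small standalone C++ sketch (not part of the patch; roundToBF16 is an illustrative helper, not an LLVM API). It picks a double just above the halfway point between two adjacent bf16 values and shows that rounding f64 -> f32 -> bf16 lands on the other side of the tie than a single correctly rounded f64 -> bf16 conversion would:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Round an f32 to bf16 with round-to-nearest-even and widen the result back
// to f32 for easy comparison. NaN handling is omitted; illustration only.
static float roundToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 16) & 1;   // LSB of the 16 bits we keep
  bits += 0x7FFF + lsb;              // round-to-nearest-even bias
  bits &= 0xFFFF0000u;               // drop the low 16 mantissa bits
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  // 1 + 2^-8 is exactly halfway between the adjacent bf16 values 1.0 and
  // 1.0078125 (= 1 + 2^-7). Nudging it up by 2^-30 means the correctly
  // rounded f64->bf16 result is 1.0078125, but the nudge is smaller than
  // half an f32 ulp, so it is lost by the intermediate f64->f32 rounding.
  double d = 0x1p0 + 0x1p-8 + 0x1p-30;

  float once  = static_cast<float>(d);  // f64 -> f32: rounds to exactly 1 + 2^-8
  float twice = roundToBF16(once);      // f32 -> bf16: tie breaks to even, i.e. 1.0

  std::printf("two-step result %.7f, single-step result would be 1.0078125\n",
              twice);
  assert(once == 1.0f + 0x1p-8f);
  assert(twice == 1.0f);                // double rounding picked the other side
  return 0;
}

The same effect applies to the FCVT_BF16_S(FCVT_S_D(...)) sequence selected by the pattern below, which is why the patch adds a FIXME noting that the pattern double rounds.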
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 ++----------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 -- .../lib/Target/RISCV/RISCVInstrInfoZfbfmin.td | 16 ++++------- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 11be5c25354f1..3742b897ca568 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -447,7 +447,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::bf16, Custom); - setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand); setOperationAction(ISD::SELECT, MVT::bf16, Custom); @@ -6631,30 +6630,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, !Subtarget.hasVInstructionsF16())) return SplitVectorOp(Op, DAG); return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget); - case ISD::FP_EXTEND: { - if (!Op.getValueType().isVector()) - return Op; + case ISD::FP_EXTEND: + case ISD::FP_ROUND: return lowerVectorFPExtendOrRoundLike(Op, DAG); - } - case ISD::FP_ROUND: { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - EVT Op0VT = Op0.getValueType(); - if (VT == MVT::bf16 && Op0VT == MVT::f32 && Subtarget.hasStdExtZfbfmin()) - return DAG.getNode(RISCVISD::FP_ROUND_BF16, DL, MVT::bf16, Op0); - if (VT == MVT::bf16 && Op0VT == MVT::f64 && Subtarget.hasStdExtZfbfmin() && - Subtarget.hasStdExtDOrZdinx()) { - SDValue FloatVal = - DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Op0, - DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)); - return DAG.getNode(RISCVISD::FP_ROUND_BF16, DL, MVT::bf16, FloatVal); - } - - if (!Op.getValueType().isVector()) - return Op; - return lowerVectorFPExtendOrRoundLike(Op, DAG); - } case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: return lowerStrictFPExtendOrRoundLike(Op, DAG); @@ -20588,7 +20566,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FCVT_WU_RV64) NODE_NAME_CASE(STRICT_FCVT_W_RV64) NODE_NAME_CASE(STRICT_FCVT_WU_RV64) - NODE_NAME_CASE(FP_ROUND_BF16) NODE_NAME_CASE(FROUND) NODE_NAME_CASE(FCLASS) NODE_NAME_CASE(FSGNJX) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 29a16282ed001..f1d1cca043b35 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -116,8 +116,6 @@ enum NodeType : unsigned { FCVT_W_RV64, FCVT_WU_RV64, - FP_ROUND_BF16, - // Rounds an FP value to its corresponding integer in the same FP format. // First operand is the value to round, the second operand is the largest // integer that can be represented exactly in the FP format. This will be diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td index fd618a938ae8c..8f0768f91c370 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td @@ -13,16 +13,6 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// RISC-V specific DAG Nodes. 
-//===----------------------------------------------------------------------===// - -def SDT_RISCVFP_ROUND_BF16 - : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, f32>]>; - -def riscv_fpround_bf16 - : SDNode<"RISCVISD::FP_ROUND_BF16", SDT_RISCVFP_ROUND_BF16>; - //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -60,7 +50,7 @@ def : StPat; /// Float conversion operations // f32 -> bf16, bf16 -> f32 -def : Pat<(bf16 (riscv_fpround_bf16 FPR32:$rs1)), +def : Pat<(bf16 (fpround FPR32:$rs1)), (FCVT_BF16_S FPR32:$rs1, FRM_DYN)>; def : Pat<(fpextend (bf16 FPR16:$rs1)), (FCVT_S_BF16 FPR16:$rs1, FRM_RNE)>; @@ -72,6 +62,10 @@ def : Pat<(riscv_fmv_x_signexth (bf16 FPR16:$src)), (FMV_X_H FPR16:$src)>; } // Predicates = [HasStdExtZfbfmin] let Predicates = [HasStdExtZfbfmin, HasStdExtD] in { +// f64 -> bf16 +// FIXME: This pattern double rounds. +def : Pat<(bf16 (fpround FPR64:$rs1)), + (FCVT_BF16_S (FCVT_S_D FPR64:$rs1, FRM_DYN), FRM_DYN)>; // bf16 -> f64 def : Pat<(fpextend (bf16 FPR16:$rs1)), (FCVT_D_S (FCVT_S_BF16 FPR16:$rs1, FRM_DYN), FRM_RNE)>; From 0ad6cee926865d7210eed9e67bfb20dce19c6633 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:29:50 +0800 Subject: [PATCH 026/425] [RISCV] Fix missing `i64` to `double` tests in the cast.ll. (NFC) (#106972) --- llvm/test/Analysis/CostModel/RISCV/cast.ll | 168 ++++++++++----------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 1fc4b3b346020..c1759b8b03d0f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -2613,7 +2613,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> @@ -2623,7 +2623,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> @@ -2633,7 +2633,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> @@ -2643,7 +2643,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> @@ -2653,7 +2653,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> @@ 
-2663,7 +2663,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> @@ -2673,7 +2673,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to @@ -2683,7 +2683,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to @@ -2693,7 +2693,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = sitofp undef to +; RV32-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to @@ -2703,7 +2703,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to @@ -2713,7 +2713,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to @@ -2723,7 +2723,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to @@ -2733,7 +2733,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: 
%nxv32i64_nxv32f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to @@ -2743,7 +2743,7 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -2756,7 +2756,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> @@ -2766,7 +2766,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> @@ -2776,7 +2776,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> @@ -2786,7 +2786,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> @@ -2796,7 +2796,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> @@ -2806,7 +2806,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated 
cost of 51 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> @@ -2816,7 +2816,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to @@ -2826,7 +2826,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to @@ -2836,7 +2836,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to @@ -2846,7 +2846,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to @@ -2856,7 +2856,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to @@ -2866,7 +2866,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to @@ -2876,7 +2876,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to @@ -2886,7 
+2886,7 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -2898,7 +2898,7 @@ define void @sitofp() { %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> - %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> + %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> @@ -2909,7 +2909,7 @@ define void @sitofp() { %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> - %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> + %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> @@ -2920,7 +2920,7 @@ define void @sitofp() { %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> - %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double> + %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> @@ -2931,7 +2931,7 @@ define void @sitofp() { %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> - %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> + %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> @@ -2942,7 +2942,7 @@ define void @sitofp() { %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> - %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> + %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> @@ -2953,7 +2953,7 @@ define void @sitofp() { %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> - %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> + %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> @@ -2964,7 +2964,7 @@ define void @sitofp() { %v128i32_v128f32 = 
sitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> - %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> + %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> @@ -2975,7 +2975,7 @@ define void @sitofp() { %nxv1i32_nxv1f32 = sitofp undef to %nxv1i32_nxv1f64 = sitofp undef to %nxv1i64_nxv1f32 = sitofp undef to - %nxv1i64_nxv1f64 = sitofp undef to + %nxv1i64_nxv1f64 = sitofp undef to %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to @@ -2986,7 +2986,7 @@ define void @sitofp() { %nxv2i32_nxv2f32 = sitofp undef to %nxv2i32_nxv2f64 = sitofp undef to %nxv2i64_nxv2f32 = sitofp undef to - %nxv2i64_nxv2f64 = sitofp undef to + %nxv2i64_nxv2f64 = sitofp undef to %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to @@ -2997,7 +2997,7 @@ define void @sitofp() { %nxv4i32_nxv4f32 = sitofp undef to %nxv4i32_nxv4f64 = sitofp undef to %nxv4i64_nxv4f32 = sitofp undef to - %nxv4i64_nxv4f64 = sitofp undef to + %nxv4i64_nxv4f64 = sitofp undef to %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to @@ -3008,7 +3008,7 @@ define void @sitofp() { %nxv8i32_nxv8f32 = sitofp undef to %nxv8i32_nxv8f64 = sitofp undef to %nxv8i64_nxv8f32 = sitofp undef to - %nxv8i64_nxv8f64 = sitofp undef to + %nxv8i64_nxv8f64 = sitofp undef to %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to @@ -3019,7 +3019,7 @@ define void @sitofp() { %nxv16i32_nxv16f32 = sitofp undef to %nxv16i32_nxv16f64 = sitofp undef to %nxv16i64_nxv16f32 = sitofp undef to - %nxv16i64_nxv16f64 = sitofp undef to + %nxv16i64_nxv16f64 = sitofp undef to %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to @@ -3030,7 +3030,7 @@ define void @sitofp() { %nxv32i32_nxv32f32 = sitofp undef to %nxv32i32_nxv32f64 = sitofp undef to %nxv32i64_nxv32f32 = sitofp undef to - %nxv32i64_nxv32f64 = sitofp undef to + %nxv32i64_nxv32f64 = sitofp undef to %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to @@ -3041,7 +3041,7 @@ define void @sitofp() { %nxv64i32_nxv64f32 = sitofp undef to %nxv64i32_nxv64f64 = sitofp undef to %nxv64i64_nxv64f32 = sitofp undef to - %nxv64i64_nxv64f64 = sitofp undef to + %nxv64i64_nxv64f64 = sitofp undef to %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to @@ -3057,7 +3057,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> @@ -3067,7 
+3067,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> @@ -3077,7 +3077,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> @@ -3087,7 +3087,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> @@ -3097,7 +3097,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> 
undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> @@ -3107,7 +3107,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> @@ -3117,7 +3117,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to @@ -3127,7 +3127,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to @@ -3137,7 +3137,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to @@ -3147,7 +3147,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to @@ -3157,7 +3157,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to @@ -3167,7 +3167,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef 
to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to @@ -3177,7 +3177,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to @@ -3187,7 +3187,7 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -3200,7 +3200,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> @@ -3210,7 +3210,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> @@ -3220,7 +3220,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> @@ -3230,7 +3230,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> @@ -3240,7 +3240,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> @@ -3250,7 +3250,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> @@ -3260,7 +3260,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to @@ -3270,7 +3270,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to @@ -3280,7 +3280,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to @@ -3290,7 +3290,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i64_nxv4f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to @@ -3300,7 +3300,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i64_nxv8f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to @@ -3310,7 +3310,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i64_nxv16f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to @@ -3320,7 +3320,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to ; 
RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i64_nxv32f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to @@ -3330,7 +3330,7 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i64_nxv64f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -3342,7 +3342,7 @@ define void @uitofp() { %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> - %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> + %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> @@ -3353,7 +3353,7 @@ define void @uitofp() { %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> - %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> + %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> @@ -3364,7 +3364,7 @@ define void @uitofp() { %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> - %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> + %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> @@ -3375,7 +3375,7 @@ define void @uitofp() { %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> - %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> + %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> @@ -3386,7 +3386,7 @@ define void @uitofp() { %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> - %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> + %v32i64_v32f64 = uitofp 
<32 x i64> undef to <32 x double> %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> @@ -3397,7 +3397,7 @@ define void @uitofp() { %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> - %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> + %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> @@ -3408,7 +3408,7 @@ define void @uitofp() { %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> - %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> + %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> @@ -3419,7 +3419,7 @@ define void @uitofp() { %nxv1i32_nxv1f32 = uitofp undef to %nxv1i32_nxv1f64 = uitofp undef to %nxv1i64_nxv1f32 = uitofp undef to - %nxv1i64_nxv1f64 = uitofp undef to + %nxv1i64_nxv1f64 = uitofp undef to %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to @@ -3430,7 +3430,7 @@ define void @uitofp() { %nxv2i32_nxv2f32 = uitofp undef to %nxv2i32_nxv2f64 = uitofp undef to %nxv2i64_nxv2f32 = uitofp undef to - %nxv2i64_nxv2f64 = uitofp undef to + %nxv2i64_nxv2f64 = uitofp undef to %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to @@ -3441,7 +3441,7 @@ define void @uitofp() { %nxv4i32_nxv4f32 = uitofp undef to %nxv4i32_nxv4f64 = uitofp undef to %nxv4i64_nxv4f32 = uitofp undef to - %nxv4i64_nxv4f64 = uitofp undef to + %nxv4i64_nxv4f64 = uitofp undef to %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to @@ -3452,7 +3452,7 @@ define void @uitofp() { %nxv8i32_nxv8f32 = uitofp undef to %nxv8i32_nxv8f64 = uitofp undef to %nxv8i64_nxv8f32 = uitofp undef to - %nxv8i64_nxv8f64 = uitofp undef to + %nxv8i64_nxv8f64 = uitofp undef to %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to @@ -3463,7 +3463,7 @@ define void @uitofp() { %nxv16i32_nxv16f32 = uitofp undef to %nxv16i32_nxv16f64 = uitofp undef to %nxv16i64_nxv16f32 = uitofp undef to - %nxv16i64_nxv16f64 = uitofp undef to + %nxv16i64_nxv16f64 = uitofp undef to %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to @@ -3474,7 +3474,7 @@ define void @uitofp() { %nxv32i32_nxv32f32 = uitofp undef to %nxv32i32_nxv32f64 = uitofp undef to %nxv32i64_nxv32f32 = uitofp undef to - %nxv32i64_nxv32f64 = uitofp undef to + %nxv32i64_nxv32f64 = uitofp undef to %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to @@ -3485,7 +3485,7 @@ define void @uitofp() { %nxv64i32_nxv64f32 = uitofp undef to %nxv64i32_nxv64f64 = uitofp undef to %nxv64i64_nxv64f32 = uitofp undef to - %nxv64i64_nxv64f64 = uitofp undef to + %nxv64i64_nxv64f64 = uitofp undef to %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to From 8b28e2ebb36d72cfffe04904e3e1b9fdfa36ef94 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 3 Sep 2024 21:14:36 -0700 Subject: [PATCH 027/425] [WebAssembly] Rename legacy EH tests (#107166) Give each test in `cfg-stackify-eh-legacy.ll` a name rather than something like `test5`, because I plan to copy many of these tests into a new file that tests for the new EH (exnref) and some of
the tests here are not applicable to the new EH so the numbering will be different, which can make things confusing. Also this removes `test_` prefixes in the test function names in `exception-legacy.ll`, because, well, we all know they are tests. --- .../WebAssembly/cfg-stackify-eh-legacy.ll | 122 +++++++++--------- .../CodeGen/WebAssembly/exception-legacy.ll | 40 +++--- 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll index 21a0949debc12..cef92f459e4aa 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll @@ -16,7 +16,7 @@ target triple = "wasm32-unknown-unknown" ; Simple test case with two catch clauses ; ; void foo(); -; void test0() { +; void two_catches() { ; try { ; foo(); ; } catch (int) { @@ -24,7 +24,7 @@ target triple = "wasm32-unknown-unknown" ; } ; } -; CHECK-LABEL: test0: +; CHECK-LABEL: two_catches: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -42,7 +42,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK: end_block # label[[L2]]: ; CHECK: rethrow 0 # to caller ; CHECK: end_try # label[[L1]]: -define void @test0() personality ptr @__gxx_wasm_personality_v0 { +define void @two_catches() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -82,7 +82,7 @@ try.cont: ; preds = %catch, %catch2, %en } ; Nested try-catches within a catch -; void test1() { +; void nested_catch() { ; try { ; foo(); ; } catch (int) { @@ -94,7 +94,7 @@ try.cont: ; preds = %catch, %catch2, %en ; } ; } -; CHECK-LABEL: test1: +; CHECK-LABEL: nested_catch: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -133,7 +133,7 @@ try.cont: ; preds = %catch, %catch2, %en ; CHECK: end_block # label[[L1]]: ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test1() personality ptr @__gxx_wasm_personality_v0 { +define void @nested_catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont11 unwind label %catch.dispatch @@ -206,7 +206,7 @@ unreachable: ; preds = %rethrow5 } ; Nested loop within a catch clause -; void test2() { +; void loop_within_catch() { ; try { ; foo(); ; } catch (...) { @@ -215,7 +215,7 @@ unreachable: ; preds = %rethrow5 ; } ; } -; CHECK-LABEL: test2: +; CHECK-LABEL: loop_within_catch: ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -243,7 +243,7 @@ unreachable: ; preds = %rethrow5 ; CHECK: br 0 # 0: up to label[[L0]] ; CHECK: end_loop ; CHECK: end_try # label[[L3]]: -define void @test2() personality ptr @__gxx_wasm_personality_v0 { +define void @loop_within_catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -297,7 +297,7 @@ terminate: ; preds = %ehcleanup ; TRY marker should be placed at bb0 because there's a branch from bb0 to bb2, ; and scopes cannot be interleaved. 
-; NOOPT-LABEL: test3: +; NOOPT-LABEL: block_try_markers: ; NOOPT: try ; NOOPT: block ; NOOPT: block @@ -309,7 +309,7 @@ terminate: ; preds = %ehcleanup ; NOOPT: call bar ; NOOPT: catch {{.*}} ; NOOPT: end_try -define void @test3() personality ptr @__gxx_wasm_personality_v0 { +define void @block_try_markers() personality ptr @__gxx_wasm_personality_v0 { bb0: br i1 undef, label %bb1, label %bb2 @@ -343,13 +343,14 @@ try.cont: ; preds = %catch.start, %bb4, ; Tests if try/end_try markers are placed correctly wrt loop/end_loop markers, ; when try and loop markers are in the same BB and end_try and end_loop are in ; another BB. +; CHECK-LABEL: loop_try_markers: ; CHECK: loop ; CHECK: try ; CHECK: call foo ; CHECK: catch ; CHECK: end_try ; CHECK: end_loop -define void @test4(ptr %p) personality ptr @__gxx_wasm_personality_v0 { +define void @loop_try_markers(ptr %p) personality ptr @__gxx_wasm_personality_v0 { entry: store volatile i32 0, ptr %p br label %loop @@ -388,7 +389,7 @@ try.cont: ; preds = %catch.start, %loop ; try-catch with try-delegate that rethrows an exception to the caller to fix ; this. -; NOSORT-LABEL: test5: +; NOSORT-LABEL: unwind_mismatches_0: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT try @@ -407,7 +408,7 @@ try.cont: ; preds = %catch.start, %loop ; NOSORT: end_try ; NOSORT: return -define void @test5() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_0() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -446,7 +447,7 @@ try.cont: ; preds = %catch.start1, %catc ; And the return value of 'baz' should NOT be stackified because the BB is split ; during fixing unwind mismatches. -; NOSORT-LABEL: test6: +; NOSORT-LABEL: unwind_mismatches_1: ; NOSORT: try ; NOSORT: call foo ; --- try-delegate starts (call unwind mismatch) @@ -462,7 +463,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOSORT: return ; NOSORT: end_try -define void @test6() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_1() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -486,11 +487,11 @@ try.cont: ; preds = %catch.start0 ret void } -; The same as test5, but we have one more call 'call @foo' in bb1 which unwinds -; to the caller. IN this case bb1 has two call unwind mismatches: 'call @foo' -; unwinds to the caller and 'call @bar' unwinds to catch C0. +; The same as unwind_mismatches_0, but we have one more call 'call @foo' in bb1 +; which unwinds to the caller. IN this case bb1 has two call unwind mismatches: + ; 'call @foo' unwinds to the caller and 'call @bar' unwinds to catch C0. -; NOSORT-LABEL: test7: +; NOSORT-LABEL: unwind_mismatches_2: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT try @@ -514,7 +515,7 @@ try.cont: ; preds = %catch.start0 ; NOSORT: end_try ; NOSORT: return -define void @test7() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_2() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -546,16 +547,17 @@ try.cont: ; preds = %catch.start1, %catc ret void } -; Similar situation as @test6. Here 'call @qux''s original unwind destination -; was the caller, but after control flow linearization, their unwind destination -; incorrectly becomes 'C0' within the function. 
We fix this by wrapping the call -; with a nested try-delegate that rethrows the exception to the caller. +; Similar situation as @unwind_mismatches_1. Here 'call @qux''s original unwind +; destination was the caller, but after control flow linearization, their unwind +; destination incorrectly becomes 'C0' within the function. We fix this by +; wrapping the call with a nested try-delegate that rethrows the exception to +; the caller. ; Because 'call @qux' pops an argument pushed by 'i32.const 5' from stack, the ; nested 'try' should be placed before `i32.const 5', not between 'i32.const 5' ; and 'call @qux'. -; NOSORT-LABEL: test8: +; NOSORT-LABEL: unwind_mismatches_3: ; NOSORT: try i32 ; NOSORT: call foo ; --- try-delegate starts (call unwind mismatch) @@ -569,7 +571,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOSORT: return ; NOSORT: end_try -define i32 @test8() personality ptr @__gxx_wasm_personality_v0 { +define i32 @unwind_mismatches_3() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -594,8 +596,8 @@ try.cont: ; preds = %catch.start0 ; Tests the case when TEE stackifies a register in RegStackify but it gets ; unstackified in fixCallUnwindMismatches in CFGStackify. -; NOSORT-LOCALS-LABEL: test9: -define void @test9(i32 %x) personality ptr @__gxx_wasm_personality_v0 { +; NOSORT-LOCALS-LABEL: unstackify_when_fixing_unwind_mismatch: +define void @unstackify_when_fixing_unwind_mismatch(i32 %x) personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -640,7 +642,7 @@ try.cont: ; preds = %catch.start0 ; first catch because it is a non-C++ exception, it shouldn't unwind to the next ; catch, but it should unwind to the caller. -; NOSORT-LABEL: test10: +; NOSORT-LABEL: unwind_mismatches_4: ; NOSORT: try ; --- try-delegate starts (catch unwind mismatch) ; NOSORT: try @@ -667,7 +669,7 @@ try.cont: ; preds = %catch.start0 ; NOSORT: end_try ; NOSORT: return -define void @test10() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_4() personality ptr @__gxx_wasm_personality_v0 { bb0: invoke void @foo() to label %bb1 unwind label %catch.dispatch0 @@ -709,7 +711,7 @@ try.cont: ; preds = %catch.start1, %catc ; (before 'cont' is sorted) and there should not be any unwind destination ; mismatches in CFGStackify. -; NOOPT-LABEL: test11: +; NOOPT-LABEL: cfg_sort_order: ; NOOPT: block ; NOOPT: try ; NOOPT: call foo @@ -718,7 +720,7 @@ try.cont: ; preds = %catch.start1, %catc ; NOOPT: call foo ; NOOPT: end_block ; NOOPT: return -define void @test11(i32 %arg) personality ptr @__gxx_wasm_personality_v0 { +define void @cfg_sort_order(i32 %arg) personality ptr @__gxx_wasm_personality_v0 { entry: %tobool = icmp ne i32 %arg, 0 br i1 %tobool, label %if.then, label %if.end @@ -753,7 +755,7 @@ if.end: ; preds = %cont, %catch.start, ; invoke.cont BB fall within try~end_try, but they shouldn't cause crashes or ; unwinding destination mismatches in CFGStackify. 
-; NOSORT-LABEL: test12: +; NOSORT-LABEL: mem_intrinsics: ; NOSORT: try ; NOSORT: call foo ; NOSORT: call {{.*}} memcpy @@ -763,7 +765,7 @@ if.end: ; preds = %cont, %catch.start, ; NOSORT: catch_all ; NOSORT: rethrow 0 ; NOSORT: end_try -define void @test12(ptr %a, ptr %b) personality ptr @__gxx_wasm_personality_v0 { +define void @mem_intrinsics(ptr %a, ptr %b) personality ptr @__gxx_wasm_personality_v0 { entry: %o = alloca %class.Object, align 1 invoke void @foo() @@ -787,11 +789,11 @@ ehcleanup: ; preds = %entry ; 'nothrow_i32' and 'fun', because the return value of 'nothrow_i32' is ; stackified and pushed onto the stack to be consumed by the call to 'fun'. -; CHECK-LABEL: test13: +; CHECK-LABEL: try_marker_with_stackified_input: ; CHECK: try ; CHECK: call $push{{.*}}=, nothrow_i32 ; CHECK: call fun, $pop{{.*}} -define void @test13() personality ptr @__gxx_wasm_personality_v0 { +define void @try_marker_with_stackified_input() personality ptr @__gxx_wasm_personality_v0 { entry: %call = call i32 @nothrow_i32() invoke void @fun(i32 %call) @@ -809,7 +811,7 @@ terminate: ; preds = %entry ; This crashed on debug mode (= when NDEBUG is not defined) when the logic for ; computing the innermost region was not correct, in which a loop region ; contains an exception region. This should pass CFGSort without crashing. -define void @test14() personality ptr @__gxx_wasm_personality_v0 { +define void @loop_exception_region() personality ptr @__gxx_wasm_personality_v0 { entry: %e = alloca %class.MyClass, align 4 br label %for.cond @@ -886,8 +888,8 @@ terminate7: ; preds = %ehcleanup ; ... ; bb2: <- Continuation BB ; end -; CHECK-LABEL: test15: -define void @test15(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +; CHECK-LABEL: remove_unnecessary_instrs: +define void @remove_unnecessary_instrs(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %for.body unwind label %catch.dispatch @@ -925,7 +927,7 @@ try.cont: ; preds = %catch.start, %for.e } ; void foo(); -; void test16() { +; void remove_unnecessary_br() { ; try { ; foo(); ; try { @@ -955,8 +957,8 @@ try.cont: ; preds = %catch.start, %for.e ; bb3: <- Continuation BB ; end ; -; CHECK-LABEL: test16: -define void @test16() personality ptr @__gxx_wasm_personality_v0 { +; CHECK-LABEL: remove_unnecessary_br: +define void @remove_unnecessary_br() personality ptr @__gxx_wasm_personality_v0 { ; CHECK: call foo entry: invoke void @foo() @@ -1003,12 +1005,12 @@ invoke.cont2: ; preds = %catch.start ; path back to the loop header), and is placed after the loop latch block ; 'invoke.cont' intentionally. This tests if 'end_loop' marker is placed ; correctly not right after 'invoke.cont' part but after 'ehcleanup' part, -; NOSORT-LABEL: test17: +; NOSORT-LABEL: loop_contains_exception: ; NOSORT: loop ; NOSORT: try ; NOSORT: end_try ; NOSORT: end_loop -define void @test17(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +define void @loop_contains_exception(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: br label %while.cond @@ -1052,14 +1054,14 @@ while.end: ; preds = %while.body, %while. ; before its corresponding `catch_all`, because both `try` and `catch_all` body ; should satisfy the return type requirements. 
-; NOSORT-LABEL: test18: +; NOSORT-LABEL: fix_function_end_return_type_with_try_catch: ; NOSORT: try i32 ; NOSORT: loop i32 ; NOSORT: end_loop ; NOSORT: catch_all ; NOSORT: end_try ; NOSORT-NEXT: end_function -define i32 @test18(i32 %n) personality ptr @__gxx_wasm_personality_v0 { +define i32 @fix_function_end_return_type_with_try_catch(i32 %n) personality ptr @__gxx_wasm_personality_v0 { entry: %t = alloca %class.Object, align 1 br label %for.cond @@ -1097,7 +1099,7 @@ ehcleanup: ; preds = %if.then ; because the initial TRY placement for 'call @quux' was done before 'call @baz' ; because 'call @baz''s return value is stackified. -; CHECK-LABEL: test19: +; CHECK-LABEL: unwind_mismatches_5: ; CHECK: try ; CHECK: try ; CHECK: call $[[RET:[0-9]+]]=, baz @@ -1105,7 +1107,7 @@ ehcleanup: ; preds = %if.then ; CHECK: call quux, $[[RET]] ; CHECK: catch_all ; CHECK: end_try -define void @test19() personality ptr @__gxx_wasm_personality_v0 { +define void @unwind_mismatches_5() personality ptr @__gxx_wasm_personality_v0 { entry: %call = call i32 @baz() invoke void @quux(i32 %call) @@ -1147,10 +1149,10 @@ invoke.cont: ; preds = %entry ; becomes invalid because it incorrectly branches into an inner scope. The ; destination should change to the BB where (b) points. -; NOSORT-LABEL: test20: +; NOSORT-LABEL: branch_remapping_after_fixing_unwind_mismatches_0: ; NOSORT: try ; NOSORT: br_if 0 -define void @test20(i1 %arg) personality ptr @__gxx_wasm_personality_v0 { +define void @branch_remapping_after_fixing_unwind_mismatches_0(i1 %arg) personality ptr @__gxx_wasm_personality_v0 { entry: br i1 %arg, label %bb0, label %dest @@ -1187,8 +1189,8 @@ try.cont: ; preds = %catch.start1, %catc ret void } -; The similar case with test20, but multiple consecutive delegates are -; generated: +; The similar case with branch_remapping_after_fixing_unwind_mismatches_0, but +; multiple consecutive delegates are generated: ; - Before: ; block ; br (a) @@ -1214,7 +1216,7 @@ try.cont: ; preds = %catch.start1, %catc ; <- (b) The br destination should be remapped to here ; ; The test was reduced by bugpoint and should not crash in CFGStackify. -define void @test21() personality ptr @__gxx_wasm_personality_v0 { +define void @branch_remapping_after_fixing_unwind_mismatches_1() personality ptr @__gxx_wasm_personality_v0 { entry: br i1 undef, label %if.then, label %if.end12 @@ -1292,7 +1294,7 @@ unreachable: ; preds = %rethrow19, %invoke. ; Regression test for WasmEHFuncInfo's reverse mapping bug. 'UnwindDestToSrc' ; should return a vector and not a single BB, which was incorrect. ; This was reduced by bugpoint and should not crash in CFGStackify. -define void @test22() personality ptr @__gxx_wasm_personality_v0 { +define void @wasm_eh_func_info_regression_test() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %catch.dispatch @@ -1348,7 +1350,7 @@ unreachable: ; preds = %invoke.cont8, %catc unreachable } -; void test23() { +; void exception_grouping_0() { ; try { ; try { ; throw 0; @@ -1364,7 +1366,7 @@ unreachable: ; preds = %invoke.cont8, %catc ; included in catch.start's exception. Also, after we take catch.start2's ; exception out of catch.start's exception, we have to take out try.cont8 out of ; catch.start's exception, because it has a predecessor in catch.start2. 
-define void @test23() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_0() personality ptr @__gxx_wasm_personality_v0 { entry: %exception = call ptr @__cxa_allocate_exception(i32 4) #0 store i32 0, ptr %exception, align 16 @@ -1442,7 +1444,7 @@ unreachable: ; preds = %rethrow, %entry ; exception first, before taking out catch.start12's exception out of ; catch.start4's exception; otherwise we end up with an incorrect relationship ; of catch.start's exception > catch.start12's exception. -define void @test24() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_1() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %catch.dispatch @@ -1525,7 +1527,7 @@ unreachable: ; preds = %rethrow, %rethrow6 unreachable } -; void test25() { +; void exception_grouping_2() { ; try { ; try { ; throw 0; @@ -1545,7 +1547,7 @@ unreachable: ; preds = %rethrow, %rethrow6 ; contained in (a)'s exception. Because (a)'s unwind destination is (b), (b)'s ; exception is taken out of (a)'s. But because (c) is reachable from (b), we ; should make sure to take out (c)'s exception out of (a)'s exception too. -define void @test25() personality ptr @__gxx_wasm_personality_v0 { +define void @exception_grouping_2() personality ptr @__gxx_wasm_personality_v0 { entry: %exception = call ptr @__cxa_allocate_exception(i32 4) #1 store i32 0, ptr %exception, align 16 diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll index 3537baa425164..aa191209516f6 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll @@ -10,10 +10,10 @@ target triple = "wasm32-unknown-unknown" ; CHECK: .tagtype __cpp_exception i32 -; CHECK-LABEL: test_throw: +; CHECK-LABEL: throw: ; CHECK: throw __cpp_exception, $0 ; CHECK-NOT: unreachable -define void @test_throw(ptr %p) { +define void @throw(ptr %p) { call void @llvm.wasm.throw(i32 0, ptr %p) ret void } @@ -21,14 +21,14 @@ define void @test_throw(ptr %p) { ; Simple test with a try-catch ; ; void foo(); -; void test_catch() { +; void catch() { ; try { ; foo(); ; } catch (int) { ; } ; } -; CHECK-LABEL: test_catch: +; CHECK-LABEL: catch: ; CHECK: global.get ${{.+}}=, __stack_pointer ; CHECK: try ; CHECK: call foo @@ -44,7 +44,7 @@ define void @test_throw(ptr %p) { ; CHECK: end_block ; CHECK: rethrow 0 ; CHECK: end_try -define void @test_catch() personality ptr @__gxx_wasm_personality_v0 { +define void @catch() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -79,12 +79,12 @@ try.cont: ; preds = %catch, %entry ; struct Temp { ; ~Temp() {} ; }; -; void test_cleanup() { +; void cleanup() { ; Temp t; ; foo(); ; } -; CHECK-LABEL: test_cleanup: +; CHECK-LABEL: cleanup: ; CHECK: try ; CHECK: call foo ; CHECK: catch_all @@ -92,7 +92,7 @@ try.cont: ; preds = %catch, %entry ; CHECK: call $drop=, _ZN4TempD2Ev ; CHECK: rethrow 0 ; CHECK: end_try -define void @test_cleanup() personality ptr @__gxx_wasm_personality_v0 { +define void @cleanup() personality ptr @__gxx_wasm_personality_v0 { entry: %t = alloca %struct.Temp, align 1 invoke void @foo() @@ -112,7 +112,7 @@ ehcleanup: ; preds = %entry ; temrinatepad, because __cxa_end_catch() also can throw within 'catch (...)'. ; ; void foo(); -; void test_terminatepad() { +; void terminatepad() { ; try { ; foo(); ; } catch (...) 
{ @@ -120,7 +120,7 @@ ehcleanup: ; preds = %entry ; } ; } -; CHECK-LABEL: test_terminatepad +; CHECK-LABEL: terminatepad ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -138,7 +138,7 @@ ehcleanup: ; preds = %entry ; CHECK: end_try ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test_terminatepad() personality ptr @__gxx_wasm_personality_v0 { +define void @terminatepad() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -182,7 +182,7 @@ terminate: ; preds = %ehcleanup ; instructions after a catch instruction. ; ; void bar(int) noexcept; -; void test_no_prolog_epilog_in_ehpad() { +; void no_prolog_epilog_in_ehpad() { ; int stack_var = 0; ; bar(stack_var); ; try { @@ -192,7 +192,7 @@ terminate: ; preds = %ehcleanup ; } ; } -; CHECK-LABEL: test_no_prolog_epilog_in_ehpad +; CHECK-LABEL: no_prolog_epilog_in_ehpad ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -217,7 +217,7 @@ terminate: ; preds = %ehcleanup ; CHECK-NOT: global.set __stack_pointer, $pop{{.+}} ; CHECK: call __cxa_end_catch ; CHECK: end_try -define void @test_no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 { +define void @no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 { entry: %stack_var = alloca i32, align 4 call void @bar(ptr %stack_var) @@ -262,14 +262,14 @@ ehcleanup: ; preds = %catch ; store SP back to __stack_pointer global at the epilog. ; ; void foo(); -; void test_no_sp_writeback() { +; void no_sp_writeback() { ; try { ; foo(); ; } catch (...) { ; } ; } -; CHECK-LABEL: test_no_sp_writeback +; CHECK-LABEL: no_sp_writeback ; CHECK: try ; CHECK: call foo ; CHECK: catch @@ -278,7 +278,7 @@ ehcleanup: ; preds = %catch ; CHECK: end_try ; CHECK-NOT: global.set __stack_pointer ; CHECK: return -define void @test_no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 { +define void @no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -300,7 +300,7 @@ try.cont: ; preds = %catch.start, %entry ; When the result of @llvm.wasm.get.exception is not used. This is created to ; fix a bug in LateEHPrepare and this should not crash. -define void @test_get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 { +define void @get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch @@ -320,7 +320,7 @@ try.cont: ; preds = %catch.start, %entry ; Tests a case when a cleanup region (cleanuppad ~ clanupret) contains another ; catchpad -define void @test_complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 { +define void @complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %invoke.cont unwind label %ehcleanup @@ -352,7 +352,7 @@ ehcleanupret: ; preds = %catch.start, %ehcle ; Regression test for the bug that 'rethrow' was not treated correctly as a ; terminator in isel. 
-define void @test_rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 { +define void @rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 { entry: invoke void @foo() to label %try.cont unwind label %catch.dispatch From 9fef09fd2918e7d8c357b98a9a798fe207941f73 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 4 Sep 2024 12:19:46 +0800 Subject: [PATCH 028/425] [Clang][CodeGen] Fix type for atomic float incdec operators (#107075) `llvm::ConstantFP::get(llvm::LLVMContext&, APFloat(float))` always returns a f32 constant. Fix https://github.com/llvm/llvm-project/issues/107054. --- clang/lib/CodeGen/CGExprScalar.cpp | 25 +- .../test/CodeGen/AMDGPU/amdgpu-atomic-float.c | 112 +++---- clang/test/CodeGen/X86/x86-atomic-double.c | 88 +++--- .../test/CodeGen/X86/x86-atomic-long_double.c | 293 ++++++++++++++---- 4 files changed, 339 insertions(+), 179 deletions(-) diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index af11bc20a3b63..7aa2d3d89c293 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2865,19 +2865,22 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::AtomicOrdering::SequentiallyConsistent); return isPre ? Builder.CreateBinOp(op, old, amt) : old; } - // Special case for atomic increment/decrement on floats + // Special case for atomic increment/decrement on floats. + // Bail out non-power-of-2-sized floating point types (e.g., x86_fp80). if (type->isFloatingType()) { - llvm::AtomicRMWInst::BinOp aop = - isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; - llvm::Instruction::BinaryOps op = - isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; - llvm::Value *amt = llvm::ConstantFP::get( - VMContext, llvm::APFloat(static_cast(1.0))); - llvm::AtomicRMWInst *old = - CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt, - llvm::AtomicOrdering::SequentiallyConsistent); + llvm::Type *Ty = ConvertType(type); + if (llvm::has_single_bit(Ty->getScalarSizeInBits())) { + llvm::AtomicRMWInst::BinOp aop = + isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub; + llvm::Instruction::BinaryOps op = + isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub; + llvm::Value *amt = llvm::ConstantFP::get(Ty, 1.0); + llvm::AtomicRMWInst *old = + CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt, + llvm::AtomicOrdering::SequentiallyConsistent); - return isPre ? Builder.CreateBinOp(op, old, amt) : old; + return isPre ? 
Builder.CreateBinOp(op, old, amt) : old; + } } value = EmitLoadOfLValue(LV, E->getExprLoc()); input = value; diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c index 6deff1116e1d8..a8fb989b64de5 100644 --- a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c +++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c @@ -99,20 +99,16 @@ float test_float_pre_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: ret double [[TMP0]] // // UNSAFE-LABEL: define dso_local double @test_double_post_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret double [[TMP0]] // double test_double_post_inc() { @@ -125,20 +121,16 @@ double test_double_post_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: ret double [[TMP0]] // // UNSAFE-LABEL: define dso_local double @test_double_post_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr 
addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret double [[TMP0]] // double test_double_post_dc() { @@ -151,22 +143,18 @@ double test_double_post_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// SAFE-NEXT: ret double [[TMP1]] // // UNSAFE-LABEL: define dso_local double @test_double_pre_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// UNSAFE-NEXT: ret double [[TMP1]] // double test_double_pre_dc() { @@ -179,22 +167,18 @@ double test_double_pre_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8 -// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// SAFE-NEXT: ret double [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), double 1.000000e+00 seq_cst, align 8 +// SAFE-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// SAFE-NEXT: ret double [[TMP1]] // // UNSAFE-LABEL: define dso_local double @test_double_pre_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: 
[[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8 -// UNSAFE-NEXT: ret double [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), double 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// UNSAFE-NEXT: ret double [[TMP1]] // double test_double_pre_inc() { @@ -207,20 +191,16 @@ double test_double_pre_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: ret half [[TMP0]] // // UNSAFE-LABEL: define dso_local half @test__Float16_post_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret half [[TMP0]] // _Float16 test__Float16_post_inc() { @@ -233,20 +213,16 @@ _Float16 test__Float16_post_inc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP1]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: ret half [[TMP0]] // // UNSAFE-LABEL: define dso_local half @test__Float16_post_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2 
-// UNSAFE-NEXT: [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP1]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: ret half [[TMP0]] // _Float16 test__Float16_post_dc() { @@ -259,22 +235,18 @@ _Float16 test__Float16_post_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: [[TMP1:%.*]] = fsub half [[TMP0]], 0xH3C00 +// SAFE-NEXT: ret half [[TMP1]] // // UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fsub half [[TMP0]], 0xH3C00 +// UNSAFE-NEXT: ret half [[TMP1]] // _Float16 test__Float16_pre_dc() { @@ -287,22 +259,18 @@ _Float16 test__Float16_pre_dc() // SAFE-NEXT: [[ENTRY:.*:]] // SAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // SAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2 -// SAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// SAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// SAFE-NEXT: ret half [[TMP2]] +// SAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), half 0xH3C00 seq_cst, align 2 +// SAFE-NEXT: [[TMP1:%.*]] = fadd half [[TMP0]], 0xH3C00 +// SAFE-NEXT: ret half [[TMP1]] // // UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc( // UNSAFE-SAME: ) #[[ATTR0]] { // UNSAFE-NEXT: [[ENTRY:.*:]] // UNSAFE-NEXT: [[RETVAL:%.*]] = alloca half, align 2, addrspace(5) // UNSAFE-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 
seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// UNSAFE-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// UNSAFE-NEXT: store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2 -// UNSAFE-NEXT: ret half [[TMP2]] +// UNSAFE-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), half 0xH3C00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]] +// UNSAFE-NEXT: [[TMP1:%.*]] = fadd half [[TMP0]], 0xH3C00 +// UNSAFE-NEXT: ret half [[TMP1]] // _Float16 test__Float16_pre_inc() { diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c index 2354c89cc2b17..09c8f70c3db85 100644 --- a/clang/test/CodeGen/X86/x86-atomic-double.c +++ b/clang/test/CodeGen/X86/x86-atomic-double.c @@ -6,20 +6,14 @@ // X64-LABEL: define dso_local double @test_double_post_inc( // X64-SAME: ) #[[ATTR0:[0-9]+]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP1]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_inc( // X86-SAME: ) #[[ATTR0:[0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP1]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: ret double [[TMP0]] // double test_double_post_inc() { @@ -30,20 +24,14 @@ double test_double_post_inc() // X64-LABEL: define dso_local double @test_double_post_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP1]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: ret double [[TMP0]] // // X86-LABEL: define dso_local double @test_double_post_dc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: store float [[TMP0]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP1]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: ret double [[TMP0]] // double test_double_post_dc() { @@ -54,22 +42,16 @@ double test_double_post_dc() // X64-LABEL: define dso_local double @test_double_pre_dc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = 
alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP2]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// X64-NEXT: ret double [[TMP1]] // // X86-LABEL: define dso_local double @test_double_pre_dc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP2]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fsub double [[TMP0]], 1.000000e+00 +// X86-NEXT: ret double [[TMP1]] // double test_double_pre_dc() { @@ -80,25 +62,43 @@ double test_double_pre_dc() // X64-LABEL: define dso_local double @test_double_pre_inc( // X64-SAME: ) #[[ATTR0]] { // X64-NEXT: entry: -// X64-NEXT: [[RETVAL:%.*]] = alloca double, align 8 -// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 -// X64-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 8 -// X64-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8 -// X64-NEXT: ret double [[TMP2]] +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X64-NEXT: ret double [[TMP1]] // // X86-LABEL: define dso_local double @test_double_pre_inc( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: entry: -// X86-NEXT: [[RETVAL:%.*]] = alloca double, align 4 -// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8 -// X86-NEXT: [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4 -// X86-NEXT: ret double [[TMP2]] +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X86-NEXT: ret double [[TMP1]] // double test_double_pre_inc() { static _Atomic double n; return ++n; } + +// X64-LABEL: define dso_local i32 @pr107054( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: entry: +// X64-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @pr107054.n, double 1.000000e+00 seq_cst, align 8 +// X64-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X64-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP1]], 1.000000e+00 +// X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X64-NEXT: ret i32 [[CONV]] +// +// X86-LABEL: define dso_local i32 @pr107054( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr @pr107054.n, double 1.000000e+00 seq_cst, align 8 +// X86-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// X86-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP1]], 
1.000000e+00 +// X86-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X86-NEXT: ret i32 [[CONV]] +// +int pr107054() +{ + static _Atomic double n; + return (++n) == 1; +} diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c index 2c3f381f13511..9c82784807dac 100644 --- a/clang/test/CodeGen/X86/x86-atomic-long_double.c +++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c @@ -4,29 +4,60 @@ // X64-LABEL: define dso_local x86_fp80 @testinc( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP3]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[INC]] // // X86-LABEL: define dso_local x86_fp80 @testinc( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 
1.000000e+00 seq_cst, align 4 -// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP3]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[INC]] // long double testinc(_Atomic long double *addr) { @@ -35,27 +66,60 @@ long double testinc(_Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @testdec( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP2]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = 
extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[TMP1]] // // X86-LABEL: define dso_local x86_fp80 @testdec( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP2]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[TMP1]] // long double testdec(_Atomic long double *addr) { @@ -175,29 +239,60 @@ long double testassign(_Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @test_volatile_inc( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X64-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP3]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic 
volatile i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[INC]] // // X86-LABEL: define dso_local x86_fp80 @test_volatile_inc( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00 -// X86-NEXT: store float [[TMP2]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP3]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP2]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[INC]] // long double 
test_volatile_inc(volatile _Atomic long double *addr) { return ++*addr; @@ -205,27 +300,60 @@ long double test_volatile_inc(volatile _Atomic long double *addr) { // X64-LABEL: define dso_local x86_fp80 @test_volatile_dec( // X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X64-NEXT: [[ENTRY:.*:]] -// X64-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ENTRY:.*]]: // X64-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 // X64-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 // X64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8 -// X64-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16 -// X64-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 16 -// X64-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16 -// X64-NEXT: ret x86_fp80 [[TMP2]] +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0 +// X64-NEXT: [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1 +// X64-NEXT: store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: ret x86_fp80 [[TMP1]] // // X86-LABEL: define dso_local x86_fp80 @test_volatile_dec( // X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[RETVAL:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ENTRY:.*]]: // X86-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 // X86-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4 -// X86-NEXT: [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4 -// X86-NEXT: store float [[TMP1]], ptr [[RETVAL]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4 -// X86-NEXT: ret x86_fp80 [[TMP2]] +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: 
[[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[DEC:%.*]] = fadd x86_fp80 [[TMP2]], 0xKBFFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[DEC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: ret x86_fp80 [[TMP1]] // long double test_volatile_dec(volatile _Atomic long double *addr) { return (*addr)--; @@ -341,3 +469,64 @@ long double test_volatile_assign(volatile _Atomic long double *addr) { return *addr; } + +// X64-LABEL: define dso_local i32 @pr107054( +// X64-SAME: ) #[[ATTR0]] { +// X64-NEXT: [[ENTRY:.*]]: +// X64-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16 +// X64-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr @pr107054.n seq_cst, align 16 +// X64-NEXT: store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16 +// X64-NEXT: br label %[[ATOMIC_OP:.*]] +// X64: [[ATOMIC_OP]]: +// X64-NEXT: [[TMP1:%.*]] = phi x86_fp80 [ [[TMP0]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[ATOMIC_OP]] ] +// X64-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP1]], 0xK3FFF8000000000000000 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[TMP1]], ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: [[TMP2:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16 +// X64-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false) +// X64-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16 +// X64-NEXT: [[TMP4:%.*]] = cmpxchg ptr @pr107054.n, i128 [[TMP2]], i128 [[TMP3]] seq_cst seq_cst, align 16 +// X64-NEXT: [[TMP5:%.*]] = extractvalue { i128, i1 } [[TMP4]], 0 +// X64-NEXT: [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP4]], 1 +// X64-NEXT: store i128 [[TMP5]], ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: [[TMP7]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16 +// X64-NEXT: br i1 [[TMP6]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X64: [[ATOMIC_CONT]]: +// X64-NEXT: [[CMP:%.*]] = fcmp oeq x86_fp80 [[INC]], 0xK3FFF8000000000000000 +// X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X64-NEXT: ret i32 [[CONV]] +// +// X86-LABEL: define dso_local i32 @pr107054( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*]]: +// X86-NEXT: [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4 +// X86-NEXT: call void @__atomic_load(i32 noundef 12, ptr 
noundef @pr107054.n, ptr noundef [[ATOMIC_TEMP]], i32 noundef 5) +// X86-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4 +// X86-NEXT: br label %[[ATOMIC_OP:.*]] +// X86: [[ATOMIC_OP]]: +// X86-NEXT: [[TMP1:%.*]] = phi x86_fp80 [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[ATOMIC_OP]] ] +// X86-NEXT: [[INC:%.*]] = fadd x86_fp80 [[TMP1]], 0xK3FFF8000000000000000 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[TMP1]], ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false) +// X86-NEXT: store x86_fp80 [[INC]], ptr [[ATOMIC_TEMP2]], align 4 +// X86-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef @pr107054.n, ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5) +// X86-NEXT: [[TMP2]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4 +// X86-NEXT: br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]] +// X86: [[ATOMIC_CONT]]: +// X86-NEXT: [[CMP:%.*]] = fcmp oeq x86_fp80 [[INC]], 0xK3FFF8000000000000000 +// X86-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +// X86-NEXT: ret i32 [[CONV]] +// +int pr107054() +{ + static _Atomic long double n; + return (++n) == 1; +} From 6c607cfb2c2d8acd2b92d7ed8106ab1e4fc0d79d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 12:57:09 +0800 Subject: [PATCH 029/425] [RISCV] Preserve tail agnostic policy in foldVMV_V_V (#105788) This patch helps avoid regressions in an upcoming patch by making sure we don't accidentally lose a tail agnostic policy when folding a vmv.v.v into its source. The previous comment about RISCVInsertVSETVLI relaxing the policy didn't take into account the fact that there's a policy operand on vmv.v.v, which can be tail agnostic. If the tail is agnostic (via either the policy operand or the passthru being undef) and vmv.v.v's VL <= Src's VL, then Src's tail can be made agnostic. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 11 +++-- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 42 +++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 412fd790061a3..35c3bc9708d91 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -529,10 +529,13 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { *Src->getParent()->getParent())); } - // Use a conservative tu,mu policy, RISCVInsertVSETVLI will relax it if - // passthru is undef. - Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())) - .setImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED); + // If MI was tail agnostic and the VL didn't increase, preserve it. 
+ int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + bool TailAgnostic = (MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) || + Passthru.getReg() == RISCV::NoRegister; + if (TailAgnostic && isVLKnownLE(MI.getOperand(3), SrcVL)) + Policy |= RISCVII::TAIL_AGNOSTIC; + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); MRI->replaceRegWith(MI.getOperand(0).getReg(), Src->getOperand(0).getReg()); MI.eraseFromParent(); diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index b2526c6df6939..771b2073370e6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -18,3 +18,45 @@ body: | %y:gpr = ADDI $x0, 1 %z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */ ... +--- +name: tail_agnostic +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: tail_agnostic + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 1 /* ta, mu */ +... +--- +name: tail_agnostic_larger_vl +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: tail_agnostic_larger_vl + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 %passthru, %x, 5, 5 /* e32 */, 1 /* ta, mu */ +... +--- +name: undef_passthru_src_undef_passthru +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: undef_passthru_src_undef_passthru + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 0 /* tu, mu */ +... From c94bd96c277e0b48e198fdc831bb576d9a04aced Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 4 Sep 2024 13:36:32 +0800 Subject: [PATCH 030/425] [Clang][CodeGen] Don't emit assumptions if current block is unreachable. (#106936) Fixes https://github.com/llvm/llvm-project/issues/106898. When emitting an infinite loop, clang codegen will delete the whole block and leave builder's current block as nullptr: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/clang/lib/CodeGen/CGStmt.cpp#L597-L600 Then clang will create `zext (icmp slt %a, %b)` without parent block for `a < b`. It will crash here: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/clang/lib/CodeGen/CGExprScalar.cpp#L416-L420 Even if we disabled this optimization, it still crashes in `Builder.CreateAssumption`: https://github.com/llvm/llvm-project/blob/837ee5b46a5f7f898f0de7e46a19600b896a0a1f/llvm/lib/IR/IRBuilder.cpp#L551-L561 This patch disables assumptions emission if current block is null. 
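
For illustration, a minimal standalone reproducer (a sketch mirroring the regression test added below; the function name and the suggested driver flags are illustrative assumptions, not part of the patch): the infinite loop leaves the IR builder with no insertion block, so emitting the assumption afterwards used to crash.

// Sketch of the crashing pattern (gh106898). Compile with a C++23-capable
// clang; the exact flags are an assumption, e.g. -std=c++23 -S -emit-llvm.
int crash_me() {
  while (true)
    ;                  // infinite loop: CodeGen deletes the dead block that follows
  int a = 0, b = 1;
  [[assume(a < b)]];   // previously emitted llvm.assume with a null insert block
  return a + b;        // unreachable
}

With the added Builder.GetInsertBlock() check, CodeGen simply skips the assumption when there is no current block, the same way other statements in unreachable code are dropped.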
--- clang/lib/CodeGen/CGStmt.cpp | 2 +- clang/test/SemaCXX/cxx23-assume.cpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 7158a06e6bc3b..b138c87a85349 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -752,7 +752,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { } break; case attr::CXXAssume: { const Expr *Assumption = cast(A)->getAssumption(); - if (getLangOpts().CXXAssumptions && + if (getLangOpts().CXXAssumptions && Builder.GetInsertBlock() && !Assumption->HasSideEffects(getContext())) { llvm::Value *AssumptionVal = EvaluateExprAsBool(Assumption); Builder.CreateAssumption(AssumptionVal); diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp index 9138501d726dd..eeae59daea3f7 100644 --- a/clang/test/SemaCXX/cxx23-assume.cpp +++ b/clang/test/SemaCXX/cxx23-assume.cpp @@ -158,3 +158,12 @@ foo (int x, int y) return x + y; } } + +// Do not crash when assumptions are unreachable. +namespace gh106898 { +int foo () { + while(1); + int a = 0, b = 1; + __attribute__((assume (a < b))); +} +} From 3e798476de466e8a051d3e753db379731a8d9705 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 22:44:49 -0700 Subject: [PATCH 031/425] [LegalizeDAG][RISCV] Don't promote f16 vector ISD::FNEG/FABS/FCOPYSIGN to f32 when we don't have Zvfh. (#106652) The fp_extend will canonicalize NaNs which is not the semantics of FNEG/FABS/FCOPYSIGN. For fixed vectors I'm scalarizing due to test changes on other targets where the scalarization is expected. I will try to address in a follow up. For scalable vectors, we bitcast to integer and use integer logic ops. --- .../SelectionDAG/LegalizeVectorOps.cpp | 63 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 3279 +++++++++++++++-- llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll | 65 +- .../CodeGen/RISCV/rvv/vfcopysign-sdnode.ll | 647 ++-- .../RISCV/rvv/vfmsub-constrained-sdnode.ll | 275 +- llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll | 59 +- .../RISCV/rvv/vfnmadd-constrained-sdnode.ll | 424 +-- .../RISCV/rvv/vfnmsub-constrained-sdnode.ll | 341 +- 9 files changed, 3691 insertions(+), 1482 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 297c349ae4e2f..29dae4e27c768 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -142,6 +142,8 @@ class VectorLegalizer { std::pair ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); SDValue ExpandFNEG(SDNode *Node); + SDValue ExpandFABS(SDNode *Node); + SDValue ExpandFCOPYSIGN(SDNode *Node); void ExpandFSUB(SDNode *Node, SmallVectorImpl &Results); void ExpandSETCC(SDNode *Node, SmallVectorImpl &Results); void ExpandBITREVERSE(SDNode *Node, SmallVectorImpl &Results); @@ -942,6 +944,18 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; } break; + case ISD::FABS: + if (SDValue Expanded = ExpandFABS(Node)) { + Results.push_back(Expanded); + return; + } + break; + case ISD::FCOPYSIGN: + if (SDValue Expanded = ExpandFCOPYSIGN(Node)) { + Results.push_back(Expanded); + return; + } + break; case ISD::FSUB: ExpandFSUB(Node, Results); return; @@ -1781,7 +1795,7 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. 
if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT) || - !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) + !(TLI.isOperationLegalOrCustom(ISD::FSUB, VT) || VT.isScalableVector())) return SDValue(); SDLoc DL(Node); @@ -1792,6 +1806,53 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, Xor); } +SDValue VectorLegalizer::ExpandFABS(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + // FIXME: We shouldn't restrict this to scalable vectors. + if (!TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || !VT.isScalableVector()) + return SDValue(); + + SDLoc DL(Node); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, Cast, ClearSignMask); + return DAG.getNode(ISD::BITCAST, DL, VT, ClearedSign); +} + +SDValue VectorLegalizer::ExpandFCOPYSIGN(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + // FIXME: We shouldn't restrict this to scalable vectors. + if (VT != Node->getOperand(1).getValueType() || + !TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || + !TLI.isOperationLegalOrCustom(ISD::OR, IntVT) || !VT.isScalableVector()) + return SDValue(); + + SDLoc DL(Node); + SDValue Mag = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue Sign = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(1)); + + SDValue SignMask = DAG.getConstant( + APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, Sign, SignMask); + + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, Mag, ClearSignMask); + + SDNodeFlags Flags; + Flags.setDisjoint(true); + + SDValue CopiedSign = + DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit, Flags); + + return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); +} + void VectorLegalizer::ExpandFSUB(SDNode *Node, SmallVectorImpl &Results) { // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3742b897ca568..5089bbbe3c0d7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -934,13 +934,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // TODO: support more ops. static const unsigned ZvfhminPromoteOps[] = { - ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, - ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, - ISD::FABS, ISD::FNEG, ISD::FCOPYSIGN, ISD::FCEIL, - ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, ISD::FRINT, - ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC, ISD::FMAXIMUM, - ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, - ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; + ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, + ISD::FCEIL, ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC, + ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, + ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; // TODO: support more vp ops. 
static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, @@ -1082,6 +1081,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // load/store setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + // Custom split nxv32f16 since nxv32f32 if not legal. if (VT == MVT::nxv32f16) { setOperationAction(ZvfhminPromoteOps, VT, Custom); @@ -1337,6 +1340,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // available. setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); } + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); // Don't promote f16 vector operations to f32 if f32 vector type is // not legal. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 1843157573257..56cd718536daa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -516,14 +516,50 @@ define void @fneg_v8f16(ptr %x) { ; ; ZVFHMIN-LABEL: fneg_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = fneg <8 x half> %a @@ -542,35 +578,112 @@ define void @fneg_v6f16(ptr %x) { ; ; ZVFHMIN-RV32-LABEL: fneg_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; 
ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: xor a6, a6, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-RV32-NEXT: xor a7, a7, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: xor a6, a7, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: xor a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fneg_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: 
fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = fneg <6 x half> %a @@ -623,17 +736,101 @@ define void @fabs_v8f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: fabs_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-RV32-LABEL: fabs_v8f16: +; ZVFHMIN-RV32: # %bb.0: +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; 
ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-RV32-NEXT: ret +; +; ZVFHMIN-RV64-LABEL: fabs_v8f16: +; ZVFHMIN-RV64: # %bb.0: +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) store <8 x half> %b, ptr %x @@ -652,35 +849,112 @@ define void @fabs_v6f16(ptr %x) { ; ; ZVFHMIN-RV32-LABEL: fabs_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: and a4, a4, a3 +; 
ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: and a5, a5, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: and a6, a7, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: and a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fabs_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; 
ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.fabs.v6f16(<6 x half> %a) @@ -737,19 +1011,287 @@ define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; 
ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, t1, -1 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, 
fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, t1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) @@ -768,58 +1310,331 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; 
ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; 
ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret - %a = load <6 x half>, ptr %x - %b = load <6 x half>, ptr %y - %c = call <6 x half> @llvm.copysign.v6f16(<6 x half> %a, <6 x half> %b) - store <6 x half> %c, ptr %x - ret void -} -declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) - -define void @copysign_v4f32(ptr %x, ptr %y) { -; ZVFH-LABEL: copysign_v4f32: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVFH-NEXT: vle32.v v8, (a0) -; ZVFH-NEXT: vle32.v v9, (a1) -; ZVFH-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFH-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-LABEL: copysign_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t1, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, t1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, t1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, 
a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret + %a = load <6 x half>, ptr %x + %b = load <6 x half>, ptr %y + %c = call <6 x half> @llvm.copysign.v6f16(<6 x half> %a, <6 x half> %b) + store <6 x half> %c, ptr %x + ret void +} +declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) + +define void @copysign_v4f32(ptr %x, ptr %y) { +; ZVFH-LABEL: copysign_v4f32: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVFH-NEXT: vle32.v v8, (a0) +; ZVFH-NEXT: vle32.v v9, (a1) +; ZVFH-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFH-NEXT: vse32.v v8, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: copysign_v4f32: @@ -864,20 +1679,215 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_vf_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: 
fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, a3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, a3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a2 +; 
ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer @@ -895,52 +1905,247 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.h fa5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; 
ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.h fa5, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 +; 
ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t0, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a4, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -999,24 +2204,303 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_neg_v8f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh 
fa2, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v8f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; 
ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = fneg <8 x half> %b @@ -1035,52 +2519,331 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_neg_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: 
copysign_neg_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, 
(a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a4, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a4) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: not t3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a5, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, t0, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a7, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, t4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t3, t3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t3, t4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t1, t4, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, t4, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a7 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a6, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v6f16: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx 
v8, v8, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fneg <6 x half> %b @@ -1143,25 +2906,187 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vle32.v v9, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) +; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: ret +; +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x float>, ptr %y %c = fneg <4 x float> 
%b @@ -1185,65 +3110,215 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret +; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFH-RV32: # %bb.0: +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: ret ; -; ZVFHMIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e64, m1, 
ta, ma -; ZVFHMIN-RV64-NEXT: vle64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v8, v10, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse32.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFH-RV64: # %bb.0: +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v10, v8 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFHIN-RV32: # %bb.0: +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; 
ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: ret +; +; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN-ZFHIN-RV64: # %bb.0: +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v10, v8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a4, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a6 +; 
ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 24 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 4(a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <3 x half>, ptr %x %b = load <3 x float>, ptr %y %c = fneg <3 x float> %b @@ -1543,23 +3618,59 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-LABEL: fmsub_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-NEXT: vle16.v v8, (a2) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v11 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -1583,53 +3694,125 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-RV32-LABEL: fmsub_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: lui a5, 8 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 ; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_v6f16: ; ZVFHMIN-RV64: 
# %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v8 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) ; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y @@ -2018,17 +4201,187 @@ define void @fneg_v16f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-LABEL: fneg_v16f16: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) -; ZVFHMIN-NEXT: ret +; ZVFHMIN-RV32-LABEL: fneg_v16f16: +; ZVFHMIN-RV32: # %bb.0: +; ZVFHMIN-RV32-NEXT: addi sp, sp, -64 +; ZVFHMIN-RV32-NEXT: 
.cfi_def_cfa_offset 64 +; ZVFHMIN-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; ZVFHMIN-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; ZVFHMIN-RV32-NEXT: .cfi_offset ra, -4 +; ZVFHMIN-RV32-NEXT: .cfi_offset s0, -8 +; ZVFHMIN-RV32-NEXT: addi s0, sp, 64 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN-RV32-NEXT: andi sp, sp, -32 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV32-NEXT: flh fa3, 6(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-RV32-NEXT: flh fa4, 8(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: lui t0, 8 +; ZVFHMIN-RV32-NEXT: xor a3, a3, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 18(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV32-NEXT: xor a4, a6, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: xor a4, a7, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: xor a2, a4, t0 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV32-NEXT: li a2, 255 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 8, v0.t +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, s0, -64 +; ZVFHMIN-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; ZVFHMIN-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; ZVFHMIN-RV32-NEXT: addi sp, sp, 64 +; ZVFHMIN-RV32-NEXT: ret +; +; ZVFHMIN-RV64-LABEL: fneg_v16f16: +; 
ZVFHMIN-RV64: # %bb.0: +; ZVFHMIN-RV64-NEXT: addi sp, sp, -64 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 64 +; ZVFHMIN-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; ZVFHMIN-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; ZVFHMIN-RV64-NEXT: .cfi_offset ra, -8 +; ZVFHMIN-RV64-NEXT: .cfi_offset s0, -16 +; ZVFHMIN-RV64-NEXT: addi s0, sp, 64 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN-RV64-NEXT: andi sp, sp, -32 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 16, e16, m1, ta, mu +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV64-NEXT: flh fa3, 6(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-RV64-NEXT: flh fa4, 8(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: lui t0, 8 +; ZVFHMIN-RV64-NEXT: xor a3, a3, t0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-RV64-NEXT: xor a4, a6, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 16(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: xor a4, a7, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV64-NEXT: xor a2, a4, t0 +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV64-NEXT: li a2, 255 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 8, v0.t +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, s0, -64 +; ZVFHMIN-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; ZVFHMIN-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; ZVFHMIN-RV64-NEXT: addi 
sp, sp, 64 +; ZVFHMIN-RV64-NEXT: ret %a = load <16 x half>, ptr %x %b = fneg <16 x half> %a store <16 x half> %b, ptr %x @@ -3554,24 +5907,60 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fmsub_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vse16.v v9, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: lui a1, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v10, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-NEXT: xor a4, a4, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: xor a1, a2, a1 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v8, v11 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmacc.vv v10, v8, v9 +; ZVFHMIN-NEXT: vfmadd.vv v8, v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 -; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3595,63 +5984,135 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-RV32-LABEL: fmsub_vf_v6f16: ; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) +; 
ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: li a4, 192 +; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV32-NEXT: lui a5, 8 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a5 +; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfneg.v v9, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmacc.vv v8, v9, v10 +; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 ; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) +; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) ; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; ; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: ; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) +; 
ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: li a4, 192 +; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-RV64-NEXT: lui a1, 1048568 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-RV64-NEXT: lui a5, 8 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a5 +; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 +; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 ; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfneg.v v9, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmacc.vv v8, v9, v10 +; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 +; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 ; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 ; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll index 95a410ea56b74..4bf9ae16cdaf0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll @@ -19,12 +19,10 @@ define @vfabs_nxv1f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv1f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma 
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv1f16( %v) ret %r @@ -41,12 +39,10 @@ define @vfabs_nxv2f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv2f16( %v) ret %r @@ -63,12 +59,10 @@ define @vfabs_nxv4f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv4f16( %v) ret %r @@ -85,12 +79,10 @@ define @vfabs_nxv8f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv8f16( %v) ret %r @@ -107,12 +99,10 @@ define @vfabs_nxv16f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv16f16( %v) ret %r @@ -129,17 +119,10 @@ define @vfabs_nxv32f16( %v) { ; ; ZVFHMIN-LABEL: vfabs_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %r = call @llvm.fabs.nxv32f16( %v) ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll index 029a121d08980..c71c07488581a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll @@ -19,13 +19,12 @@ define @vfcopysign_vv_nxv1f16( %vm, 
@llvm.copysign.nxv1f16( %vm, %vs) ret %r @@ -45,12 +44,11 @@ define @vfcopysign_vf_nxv1f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -67,18 +65,13 @@ define @vfcopynsign_vv_nxv1f16( %vm, %vs %r = call @llvm.copysign.nxv1f16( %vm, %n) @@ -99,17 +92,12 @@ define @vfcopynsign_vf_nxv1f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -130,12 +118,11 @@ define @vfcopysign_exttrunc_vv_nxv1f16_nxv1f32( %vs to %r = call @llvm.copysign.nxv1f16( %vm, %e) @@ -158,12 +145,11 @@ define @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -182,19 +168,14 @@ define @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32( %vs %eneg = fptrunc %n to @@ -216,19 +197,14 @@ define @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -254,12 +230,11 @@ define @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64( %vs to %r = call @llvm.copysign.nxv1f16( %vm, %e) @@ -286,12 +261,11 @@ define @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -312,22 +286,17 @@ define @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64( %vs %eneg = fptrunc %n to @@ -351,22 +320,17 @@ define @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -387,13 +351,12 @@ define @vfcopysign_vv_nxv2f16( %vm, @llvm.copysign.nxv2f16( %vm, %vs) ret %r @@ -413,12 +376,11 @@ define @vfcopysign_vf_nxv2f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, 
v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -435,18 +397,13 @@ define @vfcopynsign_vv_nxv2f16( %vm, %vs %r = call @llvm.copysign.nxv2f16( %vm, %n) @@ -467,17 +424,12 @@ define @vfcopynsign_vf_nxv2f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v9, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -497,13 +449,12 @@ define @vfcopysign_vv_nxv4f16( %vm, @llvm.copysign.nxv4f16( %vm, %vs) ret %r @@ -523,12 +474,11 @@ define @vfcopysign_vf_nxv4f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -545,18 +495,13 @@ define @vfcopynsign_vv_nxv4f16( %vm, %vs %r = call @llvm.copysign.nxv4f16( %vm, %n) @@ -577,17 +522,12 @@ define @vfcopynsign_vf_nxv4f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v10, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v10, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -607,13 +547,12 @@ define @vfcopysign_vv_nxv8f16( %vm, @llvm.copysign.nxv8f16( %vm, %vs) ret %r @@ -633,12 +572,11 @@ define @vfcopysign_vf_nxv8f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -655,18 +593,13 @@ define @vfcopynsign_vv_nxv8f16( %vm, %vs %r = call @llvm.copysign.nxv8f16( %vm, %n) @@ -687,17 +620,12 @@ define @vfcopynsign_vf_nxv8f16( %vm, half ; ZVFHMIN-NEXT: vfmv.v.f v12, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v12, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -718,12 +646,11 @@ define @vfcopysign_exttrunc_vv_nxv8f16_nxv8f32( %vs to %r = call @llvm.copysign.nxv8f16( %vm, %e) @@ -746,12 +673,11 @@ define @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -770,19 +696,14 @@ define @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32( %vs %eneg = fptrunc %n to @@ -804,19 +725,14 @@ define @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32( poison, float %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -842,12 +758,11 @@ define @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64( %vs to %r = call @llvm.copysign.nxv8f16( %vm, %e) @@ -874,12 +789,11 @@ define @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -900,22 +814,17 @@ define @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64( %vs %eneg = fptrunc %n to @@ -939,22 +848,17 @@ define @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64( poison, double %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -975,13 +879,12 @@ define @vfcopysign_vv_nxv16f16( %vm, @llvm.copysign.nxv16f16( %vm, %vs) ret %r @@ -1001,12 +904,11 @@ define @vfcopysign_vf_nxv16f16( %vm, ha ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1023,18 +925,13 @@ define @vfcopynsign_vv_nxv16f16( %vm, < ; ; ZVFHMIN-LABEL: vfcopynsign_vv_nxv16f16: ; ZVFHMIN: # %bb.0: 
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %n = fneg %vs %r = call @llvm.copysign.nxv16f16( %vm, %n) @@ -1055,17 +952,12 @@ define @vfcopynsign_vf_nxv16f16( %vm, h ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1085,19 +977,12 @@ define @vfcopysign_vv_nxv32f16( %vm, @llvm.copysign.nxv32f16( %vm, %vs) ret %r @@ -1117,17 +1002,13 @@ define @vfcopysign_vf_nxv32f16( %vm, ha ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vmv.v.v v28, v24 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v16, v24, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1144,29 +1025,13 @@ define @vfcopynsign_vv_nxv32f16( %vm, < ; ; ZVFHMIN-LABEL: vfcopynsign_vv_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v24, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v24, v24 -; ZVFHMIN-NEXT: 
vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v24, v24, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v16, a0 +; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %n = fneg %vs %r = call @llvm.copysign.nxv32f16( %vm, %n) @@ -1187,22 +1052,14 @@ define @vfcopynsign_vf_nxv32f16( %vm, h ; ZVFHMIN-NEXT: vfmv.v.f v16, fa5 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vmv.v.v v28, v24 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v24, a0 +; ZVFHMIN-NEXT: addi a1, a0, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll index c835dc72268b3..725ac14b0e7a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll @@ -22,19 +22,16 @@ define @vfmsub_vv_nxv1f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv1f16( %va, %vb, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -55,18 +52,15 @@ define @vfmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -86,19 +80,16 @@ define @vfmsub_vv_nxv2f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv2f16( %va, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -119,18 +110,15 @@ define @vfmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -150,19 +138,16 @@ define @vfmsub_vv_nxv4f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv4f16( %vb, %va, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -183,16 +168,13 @@ define @vfmsub_vf_nxv4f16( %va, @vfmsub_vv_nxv8f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv8f16( %vb, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") 
@@ -247,16 +226,13 @@ define @vfmsub_vf_nxv8f16( %va, @vfmsub_vv_nxv16f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv16f16( %vc, %va, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -312,16 +299,13 @@ define @vfmsub_vf_nxv16f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @vfneg_vv_nxv1f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv1f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -37,12 +34,9 @@ define @vfneg_vv_nxv2f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -57,12 +51,9 @@ define @vfneg_vv_nxv4f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -77,12 +68,9 @@ define @vfneg_vv_nxv8f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -97,12 +85,9 @@ define @vfneg_vv_nxv16f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb @@ -117,17 +102,9 @@ define @vfneg_vv_nxv32f16( %va) { ; ; ZVFHMIN-LABEL: vfneg_vv_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, 
v16 +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret %vb = fneg %va ret %vb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll index b54590cd9d844..2f41b59d6b225 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll @@ -22,24 +22,17 @@ define @vfnmsub_vv_nxv1f16( %va, %va %neg2 = fneg %vc @@ -61,23 +54,16 @@ define @vfnmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -98,24 +84,17 @@ define @vfnmsub_vv_nxv2f16( %va, %va %neg2 = fneg %vb @@ -137,23 +116,16 @@ define @vfnmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -174,24 +146,17 @@ define @vfnmsub_vv_nxv4f16( %va, %vb %neg2 = fneg %vc @@ -213,23 +178,16 @@ define @vfnmsub_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -250,24 +208,17 @@ define @vfnmsub_vv_nxv8f16( %va, %vb %neg2 = fneg %va @@ -289,23 +240,16 @@ define @vfnmsub_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -326,25 +270,17 @@ define @vfnmsub_vv_nxv16f16( %va, %vc %neg2 = fneg %vb @@ -361,29 +297,21 @@ define @vfnmsub_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -408,92 +336,79 @@ define @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, @vfnmsub_vv_nxv1f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv1f16( %neg, %vb, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -55,18 +52,15 @@ define @vfnmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -86,19 +80,16 @@ define @vfnmsub_vv_nxv2f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv2f16( %neg, %vc, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -119,18 +110,15 @@ define @vfnmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -150,19 +138,16 @@ define @vfnmsub_vv_nxv4f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv4f16( %neg, %va, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -183,18 +168,15 @@ define @vfnmsub_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -214,19 +196,16 @@ define @vfnmsub_vv_nxv8f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv8f16( %neg, %vc, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -247,18 +226,15 @@ define @vfnmsub_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -278,34 +254,16 @@ define @vfnmsub_vv_nxv16f16( %va, %vc %vd = call @llvm.experimental.constrained.fma.nxv16f16( %neg, %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -321,38 +279,20 @@ define @vfnmsub_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -376,77 +316,79 @@ define @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, Date: Tue, 3 Sep 2024 22:49:02 -0700 Subject: [PATCH 032/425] [RISCV][GISel] Use CCValAssign::getCustomReg for converting f16/f32<->GPR. (#105700) This gives us much better control of the generated code for GISel. 
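As a quick illustration (a condensed sketch only, taken from the convertLocVTToValVT hunk further down in this patch; the Full/Indirect/BCvt cases are elided), the SelectionDAG side now keys the f16/bf16/f32-in-GPR conversion off needsCustom() rather than a BCvt special case:

    // Sketch condensed from the RISCVISelLowering.cpp change below.
    static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
                                       const CCValAssign &VA, const SDLoc &DL,
                                       const RISCVSubtarget &Subtarget) {
      if (VA.needsCustom()) {
        // f16/bf16 passed in an integer register: move the bits back to an FPR.
        if (VA.getLocVT().isInteger() &&
            (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
          return DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val);
        // f32 passed in a 64-bit GPR on RV64.
        if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
          return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
        llvm_unreachable("Unexpected Custom handling.");
      }
      // ... Full/Indirect/BCvt handling unchanged ...
      return Val;
    }

The GISel handlers follow the same shape, emitting G_ANYEXT/G_TRUNC around the physreg copy in assignCustomValue.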
I've tried to closely match the current gisel code, but it looks like we had 2 layers of G_ANYEXT in some cases before. SelectionDAG now checks needsCustom() instead of detecting the special cases in the Bitcast handler. Unfortunately, IRTranslator for bitcast still generates copies between register classes of different sizes. Because of this we can't handle i16<->f16 bitcasts without crashing. Not sure if I should teach RISCVInstrInfo::copyPhysReg to allow copies between FPR16 and GPR or if I should convert the copies to instructions in GISel. --- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 61 +++++++++++------ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 68 +++++++++++++------ .../irtranslator/calling-conv-half.ll | 9 ++- 3 files changed, 92 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index c3cb1be963cab..6e33032384ede 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -109,15 +109,6 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override { - // If we're passing a smaller fp value into a larger integer register, - // anyextend before copying. - if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || - ((VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64) && - VA.getValVT() == MVT::f16)) { - LLT DstTy = LLT::scalar(VA.getLocVT().getSizeInBits()); - ValVReg = MIRBuilder.buildAnyExt(DstTy, ValVReg).getReg(0); - } - Register ExtReg = extendRegister(ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); @@ -126,16 +117,35 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { unsigned assignCustomValue(CallLowering::ArgInfo &Arg, ArrayRef VAs, std::function *Thunk) override { + const CCValAssign &VA = VAs[0]; + if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || + (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)) { + Register PhysReg = VA.getLocReg(); + + auto assignFunc = [=]() { + auto Trunc = MIRBuilder.buildAnyExt(LLT(VA.getLocVT()), Arg.Regs[0]); + MIRBuilder.buildCopy(PhysReg, Trunc); + MIB.addUse(PhysReg, RegState::Implicit); + }; + + if (Thunk) { + *Thunk = assignFunc; + return 1; + } + + assignFunc(); + return 1; + } + assert(VAs.size() >= 2 && "Expected at least 2 VAs."); - const CCValAssign &VALo = VAs[0]; const CCValAssign &VAHi = VAs[1]; assert(VAHi.needsCustom() && "Value doesn't need custom handling"); - assert(VALo.getValNo() == VAHi.getValNo() && + assert(VA.getValNo() == VAHi.getValNo() && "Values belong to different arguments"); - assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && - VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && + assert(VA.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && + VA.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && "unexpected custom value"); Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), @@ -154,7 +164,7 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { } auto assignFunc = [=]() { - assignValueToReg(NewRegs[0], VALo.getLocReg(), VALo); + assignValueToReg(NewRegs[0], VA.getLocReg(), VA); if (VAHi.isRegLoc()) assignValueToReg(NewRegs[1], VAHi.getLocReg(), VAHi); }; @@ -258,16 +268,29 @@ struct RISCVIncomingValueHandler : public 
CallLowering::IncomingValueHandler { unsigned assignCustomValue(CallLowering::ArgInfo &Arg, ArrayRef VAs, std::function *Thunk) override { + const CCValAssign &VA = VAs[0]; + if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || + (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)) { + Register PhysReg = VA.getLocReg(); + + markPhysRegUsed(PhysReg); + + LLT LocTy(VA.getLocVT()); + auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg); + + MIRBuilder.buildTrunc(Arg.Regs[0], Copy.getReg(0)); + return 1; + } + assert(VAs.size() >= 2 && "Expected at least 2 VAs."); - const CCValAssign &VALo = VAs[0]; const CCValAssign &VAHi = VAs[1]; assert(VAHi.needsCustom() && "Value doesn't need custom handling"); - assert(VALo.getValNo() == VAHi.getValNo() && + assert(VA.getValNo() == VAHi.getValNo() && "Values belong to different arguments"); - assert(VALo.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && - VALo.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && + assert(VA.getLocVT() == MVT::i32 && VAHi.getLocVT() == MVT::i32 && + VA.getValVT() == MVT::f64 && VAHi.getValVT() == MVT::f64 && "unexpected custom value"); Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)), @@ -284,7 +307,7 @@ struct RISCVIncomingValueHandler : public CallLowering::IncomingValueHandler { const_cast(VAHi)); } - assignValueToReg(NewRegs[0], VALo.getLocReg(), VALo); + assignValueToReg(NewRegs[0], VA.getLocReg(), VA); if (VAHi.isRegLoc()) assignValueToReg(NewRegs[1], VAHi.getLocReg(), VAHi); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5089bbbe3c0d7..d46a08a442a01 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19226,6 +19226,19 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // similar local variables rather than directly checking against the target // ABI. + ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); + + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || + (ValVT == MVT::f32 && XLen == 64))) { + Register Reg = State.AllocateReg(ArgGPRs); + if (Reg) { + LocVT = XLenVT; + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { LocVT = XLenVT; @@ -19235,8 +19248,6 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, LocInfo = CCValAssign::BCvt; } - ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); - // If this is a variadic argument, the RISC-V calling convention requires // that it is assigned an 'even' or 'aligned' register if it has 8-byte // alignment (RV32) or 16-byte alignment (RV64). 
An aligned register should @@ -19483,6 +19494,17 @@ void RISCVTargetLowering::analyzeOutputArgs( static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL, const RISCVSubtarget &Subtarget) { + if (VA.needsCustom()) { + if (VA.getLocVT().isInteger() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); + else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); + else + llvm_unreachable("Unexpected Custom handling."); + return Val; + } + switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); @@ -19491,14 +19513,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); break; case CCValAssign::BCvt: - if (VA.getLocVT().isInteger() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); - } else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { - Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); - } else { - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - } + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } return Val; @@ -19544,6 +19559,17 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, const RISCVSubtarget &Subtarget) { EVT LocVT = VA.getLocVT(); + if (VA.needsCustom()) { + if (LocVT.isInteger() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); + else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); + else + llvm_unreachable("Unexpected Custom handling."); + return Val; + } + switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); @@ -19552,14 +19578,7 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, Val = convertToScalableVector(LocVT, Val, DAG, Subtarget); break; case CCValAssign::BCvt: - if (LocVT.isInteger() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); - } else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) { - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); - } else { - Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); - } + Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); break; } return Val; @@ -19693,8 +19712,14 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, (LocVT == MVT::f64 && Subtarget.is64Bit() && Subtarget.hasStdExtZdinx())) { if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - LocInfo = CCValAssign::BCvt; + if (LocVT.getSizeInBits() != Subtarget.getXLen()) { + LocVT = Subtarget.getXLenVT(); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } LocVT = Subtarget.getXLenVT(); + LocInfo = CCValAssign::BCvt; State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -20337,9 +20362,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Glue = RetValue2.getValue(2); RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue, RetValue2); - } - - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); + } else + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); 
InVals.push_back(RetValue); } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll index 04fa62b195076..63bc43ae20e7b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll @@ -1018,7 +1018,6 @@ define half @caller_half_return_stack2(half %x, half %y) nounwind { ; RV64IF-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) ; RV64IF-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) ; RV64IF-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16) - ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32) ; RV64IF-NEXT: $f11_f = COPY [[ANYEXT1]](s32) ; RV64IF-NEXT: $f12_f = COPY [[ANYEXT2]](s32) @@ -1027,14 +1026,14 @@ define half @caller_half_return_stack2(half %x, half %y) nounwind { ; RV64IF-NEXT: $f15_f = COPY [[ANYEXT5]](s32) ; RV64IF-NEXT: $f16_f = COPY [[ANYEXT6]](s32) ; RV64IF-NEXT: $f17_f = COPY [[ANYEXT7]](s32) - ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT8]](s32) - ; RV64IF-NEXT: $x10 = COPY [[ANYEXT9]](s64) + ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16) + ; RV64IF-NEXT: $x10 = COPY [[ANYEXT8]](s64) ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit $f11_f, implicit $f12_f, implicit $f13_f, implicit $f14_f, implicit $f15_f, implicit $f16_f, implicit $f17_f, implicit $x10, implicit-def $f10_f ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; RV64IF-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16) - ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT10]](s32) + ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16) + ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT9]](s32) ; RV64IF-NEXT: PseudoRET implicit $f10_f ; ; RV64IZFH-LABEL: name: caller_half_return_stack2 From 4a44898be5d46694b59aa411f2b45a52f2ce8411 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 13:50:56 +0800 Subject: [PATCH 033/425] [RISCV] Add passthru to vmv.v.v intrinsic tests. 
NFC This prevents them from being optimized away in an upcoming peephole --- llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll | 430 ++++++++++++------------- 1 file changed, 215 insertions(+), 215 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll index 7217c2cfafca2..784b807a6a2e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll @@ -9,17 +9,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -29,17 +29,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -49,17 +49,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -69,17 +69,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -89,17 +89,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -109,17 +109,17 @@ declare @llvm.riscv.vmv.v.v.nxv32i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, 
ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -129,17 +129,17 @@ declare @llvm.riscv.vmv.v.v.nxv64i8( , iXLen); -define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv64i8( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -149,17 +149,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -169,17 +169,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -189,17 +189,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -209,17 +209,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -229,17 +229,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret 
entry: %a = call @llvm.riscv.vmv.v.v.nxv16i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -249,17 +249,17 @@ declare @llvm.riscv.vmv.v.v.nxv32i16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32i16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -269,17 +269,17 @@ declare @llvm.riscv.vmv.v.v.nxv1i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -289,17 +289,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -309,17 +309,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -329,17 +329,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -349,17 +349,17 @@ declare @llvm.riscv.vmv.v.v.nxv16i32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16i32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -369,17 +369,17 @@ 
declare @llvm.riscv.vmv.v.v.nxv1i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -389,17 +389,17 @@ declare @llvm.riscv.vmv.v.v.nxv2i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -409,17 +409,17 @@ declare @llvm.riscv.vmv.v.v.nxv4i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -429,17 +429,17 @@ declare @llvm.riscv.vmv.v.v.nxv8i64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -449,17 +449,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -469,17 +469,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -489,17 +489,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( %0, iXLen %1) nounwind { +define 
@intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -509,17 +509,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -529,17 +529,17 @@ declare @llvm.riscv.vmv.v.v.nxv16f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -549,17 +549,17 @@ declare @llvm.riscv.vmv.v.v.nxv32f16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32f16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -569,17 +569,17 @@ declare @llvm.riscv.vmv.v.v.nxv1bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1bf16_nxv1bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -589,17 +589,17 @@ declare @llvm.riscv.vmv.v.v.nxv2bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2bf16_nxv2bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -609,17 +609,17 @@ declare @llvm.riscv.vmv.v.v.nxv4bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: 
intrinsic_vmv.v.v_v_nxv4bf16_nxv4bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -629,17 +629,17 @@ declare @llvm.riscv.vmv.v.v.nxv8bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8bf16_nxv8bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -649,17 +649,17 @@ declare @llvm.riscv.vmv.v.v.nxv16bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16bf16_nxv16bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -669,17 +669,17 @@ declare @llvm.riscv.vmv.v.v.nxv32bf16( , iXLen); -define @intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32bf16_nxv32bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv32bf16( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -689,17 +689,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -709,17 +709,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -729,17 +729,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, 
e32, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -749,17 +749,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -769,17 +769,17 @@ declare @llvm.riscv.vmv.v.v.nxv16f32( , iXLen); -define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv16f32( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -789,17 +789,17 @@ declare @llvm.riscv.vmv.v.v.nxv1f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -809,17 +809,17 @@ declare @llvm.riscv.vmv.v.v.nxv2f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -829,17 +829,17 @@ declare @llvm.riscv.vmv.v.v.nxv4f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } @@ -849,17 +849,17 @@ declare @llvm.riscv.vmv.v.v.nxv8f64( , iXLen); -define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( %0, iXLen %1) nounwind { +define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v16 ; 
CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8f64( - undef, %0, - iXLen %1) + %1, + iXLen %2) ret %a } From 3449ed8dece600f387357b71ff74ae4bc46828b6 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 3 Sep 2024 22:47:34 -0700 Subject: [PATCH 034/425] Revert "[clang-format] Correctly annotate braces in macro definition (#106662)" This reverts commit 0fa78b6c7bd43c2498700a98c47a02cf4fd06388 due to regression. Fixes #107096. --- clang/lib/Format/UnwrappedLineParser.cpp | 3 ++- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 0d42a6c2bfb5c..246b29d308bfa 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -609,8 +609,9 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = NextTok->isNot(tok::l_square); } - // Cpp macro definition body containing nonempty braced list or block: + // Cpp macro definition body that is a nonempty braced list or block: if (IsCpp && Line->InMacroBody && PrevTok != FormatTok && + !FormatTok->Previous && NextTok->is(tok::eof) && // A statement can end with only `;` (simple statement), a block // closing brace (compound statement), or `:` (label statement). // If PrevTok is a block opening brace, Tok ends an empty block. diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index a2986f589396b..c0436d8a2e180 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3278,11 +3278,6 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[10], BK_Block); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace); EXPECT_BRACE_KIND(Tokens[11], BK_Block); - - Tokens = annotate("#define MEMBER(NAME) NAME{\"\"}"); - ASSERT_EQ(Tokens.size(), 11u) << Tokens; - EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); - EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) { From 7deda4ed0c712fb830d25f4e3090ff04f7adbcf9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 3 Sep 2024 23:13:52 -0700 Subject: [PATCH 035/425] [RISCV] Use MCRegister for variables returned from AllocateReg. NFC Avoids a cast from Register to MCRegister for the CCValAssign functions. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d46a08a442a01..8f95a86ade303 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19087,7 +19087,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, State.getMachineFunction().getSubtarget(); ArrayRef ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI()); - if (Register Reg = State.AllocateReg(ArgGPRs)) { + if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { // At least one half can be passed via register. State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, VA1.getLocVT(), CCValAssign::Full)); @@ -19108,7 +19108,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } - if (Register Reg = State.AllocateReg(ArgGPRs)) { + if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { // The second half can also be passed via register. 
State.addLoc( CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); @@ -19230,7 +19230,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || (ValVT == MVT::f32 && XLen == 64))) { - Register Reg = State.AllocateReg(ArgGPRs); + MCRegister Reg = State.AllocateReg(ArgGPRs); if (Reg) { LocVT = XLenVT; State.addLoc( @@ -19283,7 +19283,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these // cases. - Register Reg = State.AllocateReg(ArgGPRs); + MCRegister Reg = State.AllocateReg(ArgGPRs); if (!Reg) { unsigned StackOffset = State.AllocateStack(8, Align(8)); State.addLoc( @@ -19292,7 +19292,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } LocVT = MVT::i32; State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - Register HiReg = State.AllocateReg(ArgGPRs); + MCRegister HiReg = State.AllocateReg(ArgGPRs); if (HiReg) { State.addLoc( CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); @@ -19340,7 +19340,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } // Allocate to a register if possible, or else a stack slot. - Register Reg; + MCRegister Reg; unsigned StoreSizeBytes = XLen / 8; Align StackAlign = Align(XLen / 8); From 06286832db0c4ee1899f9cee1b8f6234e45f16c7 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 3 Sep 2024 23:29:03 -0700 Subject: [PATCH 036/425] Reland "Revert "AtomicExpand: Allow incrementally legalizing atomicrmw"" (#106793) Reverts llvm/llvm-project#106792 The first commit of PR is pure revert, the rest is a possible fix. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++++--------- 5 files changed, 836 insertions(+), 691 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 39a705599f90c..2da723a0cc175 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,17 +351,30 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (Instruction &I : instructions(F)) - if (I.isAtomic() && !isa(&I)) - AtomicInsts.push_back(&I); - - for (auto *I : AtomicInsts) { - if (processAtomicInstr(I)) - MadeChange = true; + for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { + BasicBlock *BB = &*BBI; + ++BBI; + + BasicBlock::iterator Next; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + I = Next) { + Instruction &Inst = *I; + Next = std::next(I); + + if (processAtomicInstr(&Inst)) { + MadeChange = true; + + // Detect control flow change and resume iteration from the original + // block to inspect any newly inserted blocks. This allows incremental + // legalization of atomicrmw and cmpxchg. 
+ if (Next == E || BB != Next->getParent()) { + BBI = BB->getIterator(); + BBE = F.end(); + break; + } + } + } } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0d230bb9dcc6e..ed9c1b037d0cc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, 
half %value seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] +; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index bfe0d20ca814b..888b795876f7d 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: 
// %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 6b7d2df044460..a3665c6e42860 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; 
SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 67e164037d5ce..7725ce0e73185 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 
+; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] +; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 427e202a401514cb28bf2ca621baae8e1b2f552f Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Tue, 3 Sep 2024 23:54:36 -0700 Subject: [PATCH 037/425] [APInt] improve initialization performance (#106945) The purpose is to save an extra memset in both cases: 1. When `int64_t(val) < 0`, zeroing out is redundant as the subsequent for-loop will initialize to `val .. 0xFFFFF ....`. Instead we should only create an uninitialized buffer, and transform the slow for-loop into a memset to initialize the higher words to `0xFF`. 2. In the other case, first we create an uninitialized array (`new int64_t[]`) and _then_ we zero it out with `memset`. But this can be combined in one operation with `new int64_t[]()`, which default-initializes the array. On one example where use of APInt was heavy, this improved compile time by 1%. --- llvm/lib/Support/APInt.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 78d573966c6c9..2348a4c9b795e 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -34,9 +34,7 @@ using namespace llvm; /// A utility function for allocating memory, checking for allocation failures, /// and ensuring the contents are zeroed. 
inline static uint64_t* getClearedMemory(unsigned numWords) { - uint64_t *result = new uint64_t[numWords]; - memset(result, 0, numWords * sizeof(uint64_t)); - return result; + return new uint64_t[numWords](); } /// A utility function for allocating memory and checking for allocation @@ -74,12 +72,15 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) { void APInt::initSlowCase(uint64_t val, bool isSigned) { - U.pVal = getClearedMemory(getNumWords()); - U.pVal[0] = val; - if (isSigned && int64_t(val) < 0) - for (unsigned i = 1; i < getNumWords(); ++i) - U.pVal[i] = WORDTYPE_MAX; - clearUnusedBits(); + if (isSigned && int64_t(val) < 0) { + U.pVal = getMemory(getNumWords()); + U.pVal[0] = val; + memset(&U.pVal[1], 0xFF, APINT_WORD_SIZE * (getNumWords() - 1)); + clearUnusedBits(); + } else { + U.pVal = getClearedMemory(getNumWords()); + U.pVal[0] = val; + } } void APInt::initSlowCase(const APInt& that) { From 4bccb01355edcfedacafede3e7878d74e2b0a28f Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 4 Sep 2024 10:02:55 +0200 Subject: [PATCH 038/425] [Clang] Workaround dependent source location issues (#106925) In #78436 we made some SourceLocExpr dependent to deal with the fact that their value should reflect the name of specialized function - rather than the rtemplate in which they are first used. However SourceLocExpr are unusual in two ways - They don't depend on template arguments - They morally depend on the context in which they are used (rather than called from). It's fair to say that this is quite novels and confuses clang. In particular, in some cases, we used to create dependent SourceLocExpr and never subsequently transform them, leaving dependent objects in instantiated functions types. To work around that we avoid replacing SourceLocExpr when we think they could remain dependent. It's certainly not perfect but it fixes a number of reported bugs, and seem to only affect scenarios in which the value of the SourceLocExpr does not matter (overload resolution). Fixes #106428 Fixes #81155 Fixes #80210 Fixes #85373 --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaExpr.cpp | 21 +++++++-- clang/test/SemaCXX/source_location.cpp | 60 ++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0eee71d00a2c5..45b08128600ef 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -342,6 +342,8 @@ Bug Fixes to C++ Support specialization right before its declaration context. (#GH64082) - Fixed a constraint comparison bug for friend declarations. (#GH78101) - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) +- Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 94bb938b53b44..e291ef6c97eef 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5443,11 +5443,24 @@ struct EnsureImmediateInvocationInDefaultArgs // Rewrite to source location to refer to the context in which they are used. 
ExprResult TransformSourceLocExpr(SourceLocExpr *E) { - if (E->getParentContext() == SemaRef.CurContext) + DeclContext *DC = E->getParentContext(); + if (DC == SemaRef.CurContext) return E; - return getDerived().RebuildSourceLocExpr(E->getIdentKind(), E->getType(), - E->getBeginLoc(), E->getEndLoc(), - SemaRef.CurContext); + + // FIXME: During instantiation, because the rebuild of defaults arguments + // is not always done in the context of the template instantiator, + // we run the risk of producing a dependent source location + // that would never be rebuilt. + // This usually happens during overload resolution, or in contexts + // where the value of the source location does not matter. + // However, we should find a better way to deal with source location + // of function templates. + if (!SemaRef.CurrentInstantiationScope || + !SemaRef.CurContext->isDependentContext() || DC->isDependentContext()) + DC = SemaRef.CurContext; + + return getDerived().RebuildSourceLocExpr( + E->getIdentKind(), E->getType(), E->getBeginLoc(), E->getEndLoc(), DC); } }; diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp index 6b3610d703e71..34177bfe287fc 100644 --- a/clang/test/SemaCXX/source_location.cpp +++ b/clang/test/SemaCXX/source_location.cpp @@ -929,3 +929,63 @@ void test() { } } + +namespace GH106428 { + +struct add_fn { + template + constexpr auto operator()(T lhs, T rhs, + const std::source_location loc = std::source_location::current()) + const -> T + { + return lhs + rhs; + } +}; + + +template +decltype(_Fp{}(0, 0)) +__invoke(_Fp&& __f); + +template +struct type_identity { using type = T; }; + +template +struct invoke_result : type_identity {}; + +using i = invoke_result::type; +static_assert(__is_same(i, int)); + +} + +#if __cplusplus >= 202002L + +namespace GH81155 { +struct buff { + buff(buff &, const char * = __builtin_FUNCTION()); +}; + +template +Ty declval(); + +template +auto Call(buff arg) -> decltype(Fx{}(arg)); + +template +struct F {}; + +template +struct InvocableR : F(declval()))> { + static constexpr bool value = false; +}; + +template ::value> +void Help(Fx) {} + +void Test() { + Help([](buff) {}); +} + +} + +#endif From de37da8e37c4c9042563e186068adca98bf59e07 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 4 Sep 2024 09:06:48 +0100 Subject: [PATCH 039/425] [MachineOutliner] Preserve instruction bundles (#106402) When the machine outliner copies instructions from a source function into an outlined function, it was doing it using `CloneMachineInstr`, which is documented as not preserving the interior of any instruction bundle. So outlining code that includes an instruction bundle would fail, because in the outlined version, the bundle would be empty, so instructions would go missing in the move. This occurs when any bundled instruction appears in the outlined code, so there was no need to construct an unusual test case: I've just copied a function from the existing `stp-opt-with-renaming.mir`, which happens to contain an SVE instruction bundle. Including two identical copies of that function makes the outliner merge them, and then we check that it didn't destroy the interior of the bundle in the process. 
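To make the difference concrete, here is a minimal sketch of the copy loop this change affects. It is for illustration only and is not the outliner's actual code: the helper name and the way the source instructions are passed in are invented, and it assumes the usual LLVM CodeGen headers.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Copy candidate instructions into the (empty) outlined function's block.
  void copyIntoOutlinedBlock(MachineFunction &MF, MachineBasicBlock &MBB,
                             const TargetInstrInfo &TII,
                             ArrayRef<MachineInstr *> Candidate) {
    for (MachineInstr *MI : Candidate) {
      // Old approach: CloneMachineInstr copies only the BUNDLE header, so the
      // instructions inside the bundle are silently dropped:
      //   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      //   MBB.insert(MBB.end(), NewMI);
      // New approach: ask the target to duplicate the instruction, which
      // (unlike the shallow clone) also carries over the bundle contents.
      MachineInstr &NewMI = TII.duplicate(MBB, MBB.end(), *MI);
      NewMI.dropMemRefs(MF); // same clean-up as before
    }
  }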
--- llvm/lib/CodeGen/MachineOutliner.cpp | 7 ++- .../AArch64/machine-outliner-bundle.mir | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 42f410c277179..97500d0abebad 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -763,10 +763,9 @@ MachineFunction *MachineOutliner::createOutlinedFunction( BuildMI(MBB, MBB.end(), DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(MF.addFrameInst(CFI)); } else { - MachineInstr *NewMI = MF.CloneMachineInstr(&MI); - NewMI->dropMemRefs(MF); - NewMI->setDebugLoc(DL); - MBB.insert(MBB.end(), NewMI); + MachineInstr &NewMI = TII.duplicate(MBB, MBB.end(), MI); + NewMI.dropMemRefs(MF); + NewMI.setDebugLoc(DL); } } diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir b/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir new file mode 100644 index 0000000000000..1dd5b0811bdfb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-bundle.mir @@ -0,0 +1,54 @@ +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# CHECK: name: OUTLINED_FUNCTION_0 +# CHECK-NOT: name: +# CHECK: BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { +# CHECK: $z3 = MOVPRFX_ZZ $z19 +# CHECK: $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 +# CHECK: } + +--- +name: bundled +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + liveins: $z3, $z19, $p0, $z16 + renamable $q0 = LDRQui $sp, 1 :: (load 16) + STRSui renamable $s0, $sp, 9, implicit killed $q0 :: (store (s32)) + BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { + $z3 = MOVPRFX_ZZ $z19 + $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 + } + renamable $q0 = LDRQui $sp, 0 :: (load 16, align 32) + STRSui renamable $s0, $sp, 10, implicit killed $q0 :: (store (s32)) + RET undef $lr +... +--- +name: bundled_clone +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + liveins: $z3, $z19, $p0, $z16 + renamable $q0 = LDRQui $sp, 1 :: (load 16) + STRSui renamable $s0, $sp, 9, implicit killed $q0 :: (store (s32)) + BUNDLE implicit-def $z3, implicit-def $q3, implicit-def $d3, implicit-def $s3, implicit-def $h3, implicit-def $b3, implicit $z19, implicit $p0, implicit $z16 { + $z3 = MOVPRFX_ZZ $z19 + $z3 = FMUL_ZPmZ_S renamable $p0, killed $z3, renamable $z16 + } + renamable $q0 = LDRQui $sp, 0 :: (load 16, align 32) + STRSui renamable $s0, $sp, 10, implicit killed $q0 :: (store (s32)) + RET undef $lr +... From 01e56849001b4ace984e9557abc82bc051e03677 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 4 Sep 2024 10:09:04 +0200 Subject: [PATCH 040/425] [clang] Respect the lifetimebound in assignment operator. 
(#106997) Fixes #106372 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/CheckExprLifetime.cpp | 64 +++++++++++++++-------- clang/lib/Sema/CheckExprLifetime.h | 1 + clang/lib/Sema/SemaOverload.cpp | 4 +- clang/test/SemaCXX/attr-lifetimebound.cpp | 20 +++++++ 5 files changed, 67 insertions(+), 24 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 45b08128600ef..511724c73015e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -268,6 +268,8 @@ Improvements to Clang's diagnostics - Improved diagnostic when trying to overload a function in an ``extern "C"`` context. (#GH80235) +- Clang now respects lifetimebound attribute for the assignment operator parameter. (#GH106372). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f28789dba34e1..f7540a6e3a897 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -326,24 +326,11 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { return false; } -static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { - const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); - if (!TSI) - return false; - // Don't declare this variable in the second operand of the for-statement; - // GCC miscompiles that by ending its lifetime before evaluating the - // third operand. See gcc.gnu.org/PR86769. - AttributedTypeLoc ATL; - for (TypeLoc TL = TSI->getTypeLoc(); - (ATL = TL.getAsAdjusted()); - TL = ATL.getModifiedLoc()) { - if (ATL.getAttrAs()) - return true; - } - - // Assume that all assignment operators with a "normal" return type return - // *this, that is, an lvalue reference that is the same type as the implicit - // object parameter (or the LHS for a non-member operator$=). +// Return true if this is an "normal" assignment operator. +// We assuments that a normal assingment operator always returns *this, that is, +// an lvalue reference that is the same type as the implicit object parameter +// (or the LHS for a non-member operator$=). +static bool isNormalAsisgnmentOperator(const FunctionDecl *FD) { OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { QualType RetT = FD->getReturnType(); @@ -359,10 +346,27 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return true; } } - return false; } +static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { + const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); + if (!TSI) + return false; + // Don't declare this variable in the second operand of the for-statement; + // GCC miscompiles that by ending its lifetime before evaluating the + // third operand. See gcc.gnu.org/PR86769. + AttributedTypeLoc ATL; + for (TypeLoc TL = TSI->getTypeLoc(); + (ATL = TL.getAsAdjusted()); + TL = ATL.getModifiedLoc()) { + if (ATL.getAttrAs()) + return true; + } + + return isNormalAsisgnmentOperator(FD); +} + // Visit lifetimebound or gsl-pointer arguments. 
static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, LocalVisitor Visit, @@ -968,6 +972,22 @@ static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) { return false; } +static bool isAssginmentOperatorLifetimeBound(CXXMethodDecl *CMD) { + if (!CMD) + return false; + return isNormalAsisgnmentOperator(CMD) && CMD->param_size() == 1 && + CMD->getParamDecl(0)->hasAttr(); +} + +static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef, + const AssignedEntity &Entity) { + bool EnableGSLAssignmentWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer_assignment, SourceLocation()); + return (EnableGSLAssignmentWarnings && + (isRecordWithAttr(Entity.LHS->getType()) || + isAssginmentOperatorLifetimeBound(Entity.AssignmentOperator))); +} + static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, @@ -1267,8 +1287,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, }; llvm::SmallVector Path; - if (EnableLifetimeWarnings && LK == LK_Assignment && - isRecordWithAttr(AEntity->LHS->getType())) + if (LK == LK_Assignment && shouldRunGSLAssignmentAnalysis(SemaRef, *AEntity)) Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init}); if (Init->isGLValue()) @@ -1301,8 +1320,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, diag::warn_dangling_pointer_assignment, SourceLocation()); bool RunAnalysis = (EnableDanglingPointerAssignment && Entity.LHS->getType()->isPointerType()) || - (EnableLifetimeWarnings && - isRecordWithAttr(Entity.LHS->getType())); + shouldRunGSLAssignmentAnalysis(SemaRef, Entity); if (!RunAnalysis) return; diff --git a/clang/lib/Sema/CheckExprLifetime.h b/clang/lib/Sema/CheckExprLifetime.h index af381fb96c4d6..8c8d0806dee0a 100644 --- a/clang/lib/Sema/CheckExprLifetime.h +++ b/clang/lib/Sema/CheckExprLifetime.h @@ -22,6 +22,7 @@ namespace clang::sema { struct AssignedEntity { // The left-hand side expression of the assignment. Expr *LHS = nullptr; + CXXMethodDecl *AssignmentOperator = nullptr; }; /// Check that the lifetime of the given expr (and its subobjects) is diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 95551173df91a..861b0a91240b3 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14768,7 +14768,9 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // Check for a self move. DiagnoseSelfMove(Args[0], Args[1], OpLoc); // lifetime check. 
- checkExprLifetime(*this, AssignedEntity{Args[0]}, Args[1]); + checkExprLifetime( + *this, AssignedEntity{Args[0], dyn_cast(FnDecl)}, + Args[1]); } if (ImplicitThis) { QualType ThisType = Context.getPointerType(ImplicitThis->getType()); diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index 6566ed6270cd1..0fb997a567108 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -287,3 +287,23 @@ std::span test2() { return abc; // expected-warning {{address of stack memory associated with local variable}} } } // namespace ctor_cases + +namespace GH106372 { +class [[gsl::Owner]] Foo {}; +class [[gsl::Pointer]] FooView {}; + +class NonAnnotatedFoo {}; +class NonAnnotatedFooView {}; + +template +struct StatusOr { + template + StatusOr& operator=(U&& v [[clang::lifetimebound]]); +}; + +void test(StatusOr foo1, StatusOr foo2) { + foo1 = Foo(); // expected-warning {{object backing the pointer foo1 will be destroyed at the end}} + // No warning on non-gsl annotated types. + foo2 = NonAnnotatedFoo(); +} +} // namespace GH106372 From 771b7af1db15e59f370ccadaa98bee8e5270b5f1 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 4 Sep 2024 10:13:47 +0200 Subject: [PATCH 041/425] =?UTF-8?q?Reapply=20"[llvm/DWARF]=20Recursively?= =?UTF-8?q?=20resolve=20DW=5FAT=5Fsignature=20references"=E2=80=A6=20(#994?= =?UTF-8?q?95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … (#99444) The previous version introduced a bug (caught by cross-project tests). Explicit signature resolution is still necessary when one wants to access the children (not attributes) of a given DIE. The new version keeps just the findRecursively extension, and reverts all the DWARFTypePrinter modifications. 
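To illustrate the intended effect on API users, a hedged sketch follows. The function and the skeleton DIE it receives are hypothetical and not part of this patch; it assumes a DW_TAG_structure_type DIE that carries only DW_AT_signature, as in the new unit test.

  #include "llvm/DebugInfo/DWARF/DWARFDie.h"
  #include <cassert>
  #include <optional>
  using namespace llvm;

  void inspectSkeleton(DWARFDie SkeletonDie) {
    // A plain find() still fails: the name lives in the referenced type unit.
    assert(!SkeletonDie.find(dwarf::DW_AT_name));
    // findRecursively() now also chases DW_AT_signature (in addition to
    // DW_AT_abstract_origin and DW_AT_specification), so the name resolves.
    std::optional<DWARFFormValue> Name =
        SkeletonDie.findRecursively(dwarf::DW_AT_name);
    assert(Name && "resolved through the type unit");
    // Walking the referenced type's *children* still requires resolving the
    // signature explicitly, which is why the DWARFTypePrinter changes were
    // kept out of this reapplication.
    DWARFDie Definition =
        SkeletonDie.getAttributeValueAsReferencedDie(dwarf::DW_AT_signature);
    for (DWARFDie Child : Definition.children())
      (void)Child; // member DIEs live here, not under SkeletonDie.
  }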
--- llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 13 ++-- .../DebugInfo/DWARF/DWARFDieTest.cpp | 61 +++++++++++++++++++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 5daa093ee8a1b..9c26c4f8892b0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -291,13 +291,12 @@ DWARFDie::findRecursively(ArrayRef Attrs) const { if (auto Value = Die.find(Attrs)) return Value; - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) - if (Seen.insert(D).second) - Worklist.push_back(D); - - if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification)) - if (Seen.insert(D).second) - Worklist.push_back(D); + for (dwarf::Attribute Attr : + {DW_AT_abstract_origin, DW_AT_specification, DW_AT_signature}) { + if (auto D = Die.getAttributeValueAsReferencedDie(Attr)) + if (Seen.insert(D).second) + Worklist.push_back(D); + } } return std::nullopt; diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp index e1057b214ee4d..485ec720ffad6 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp @@ -643,4 +643,65 @@ TEST(DWARFDie, getDeclFileSpecificationAcrossCUBoundary) { EXPECT_EQ(DeclFile, Ref); } +TEST(DWARFDie, getNameFromTypeUnit) { + const char *yamldata = R"( + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_signature + Form: DW_FORM_ref_sig8 + - Code: 0x3 + Tag: DW_TAG_type_unit + Children: DW_CHILDREN_yes + - Code: 0x4 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AbbrevTableID: 0 + Entries: + - AbbrCode: 0x1 + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeefbaadf00d + - AbbrCode: 0x0 + - Version: 5 + UnitType: DW_UT_type + AbbrevTableID: 0 + TypeSignature: 0xdeadbeefbaadf00d + TypeOffset: 25 + Entries: + - AbbrCode: 0x3 + - AbbrCode: 0x4 + Values: + - CStr: "STRUCT" + - AbbrCode: 0x0 + )"; + + Expected>> Sections = + DWARFYAML::emitDebugSections(StringRef(yamldata), + /*IsLittleEndian=*/true, + /*Is64BitAddrSize=*/true); + ASSERT_THAT_EXPECTED(Sections, Succeeded()); + std::unique_ptr Ctx = + DWARFContext::create(*Sections, 4, /*isLittleEndian=*/true); + DWARFCompileUnit *CU = Ctx->getCompileUnitForOffset(0); + ASSERT_NE(nullptr, CU); + DWARFDie Die = CU->getUnitDIE(/*ExtractUnitDIEOnly=*/false).getFirstChild(); + ASSERT_TRUE(Die.isValid()); + + ASSERT_STREQ(Die.getName(DINameKind::ShortName), "STRUCT"); +} + } // end anonymous namespace From 009184fc3920f8a14dff9971edf68754ba28da5f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:28:20 -0700 Subject: [PATCH 042/425] [ThinLTO] Avoid repeated std::map lookups (NFC) (#107156) This patch avoids repeated std::map lookups with try_emplace. While I am at it, this patch adds a couple of calls to std::vector::reserve. 
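For illustration, the same lookup-avoidance pattern on a plain std::map; the key and value types below are placeholders, not the actual summary-index types.

  #include <cstdint>
  #include <map>

  struct Info {
    explicit Info(bool IsAnalysis) : IsAnalysis(IsAnalysis) {}
    bool IsAnalysis;
  };

  // Before: up to two lookups (count(), then emplace()/find()).
  Info &getOrCreateTwoLookups(std::map<uint64_t, Info> &M, uint64_t GUID) {
    if (!M.count(GUID))
      M.emplace(GUID, /*IsAnalysis=*/false);
    return M.find(GUID)->second;
  }

  // After: try_emplace() does a single lookup and only constructs the value
  // when the key is absent; the returned iterator is valid either way.
  Info &getOrCreateOneLookup(std::map<uint64_t, Info> &M, uint64_t GUID) {
    return M.try_emplace(GUID, /*IsAnalysis=*/false).first->second;
  }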
--- llvm/include/llvm/IR/ModuleSummaryIndexYAML.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index b2747d24c5396..6cc533f043a51 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -214,15 +214,13 @@ template <> struct CustomMappingTraits { io.setError("key not an integer"); return; } - if (!V.count(KeyInt)) - V.emplace(KeyInt, /*IsAnalysis=*/false); - auto &Elem = V.find(KeyInt)->second; + auto &Elem = V.try_emplace(KeyInt, /*IsAnalysis=*/false).first->second; for (auto &FSum : FSums) { std::vector Refs; + Refs.reserve(FSum.Refs.size()); for (auto &RefGUID : FSum.Refs) { - if (!V.count(RefGUID)) - V.emplace(RefGUID, /*IsAnalysis=*/false); - Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID))); + auto It = V.try_emplace(RefGUID, /*IsAnalysis=*/false).first; + Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*It)); } Elem.SummaryList.push_back(std::make_unique( GlobalValueSummary::GVFlags( @@ -247,6 +245,7 @@ template <> struct CustomMappingTraits { for (auto &Sum : P.second.SummaryList) { if (auto *FSum = dyn_cast(Sum.get())) { std::vector Refs; + Refs.reserve(FSum->refs().size()); for (auto &VI : FSum->refs()) Refs.push_back(VI.getGUID()); FSums.push_back(FunctionSummaryYaml{ From 8bfd6b953fc119bbc37c1755e701261fcfb31ad2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:28:49 -0700 Subject: [PATCH 043/425] [SSAUpdater] Use DenseMap::operator[] (NFC) (#107179) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. I thought about renaming the variable to PredInfo, but the name is taken, so I am leaving the name as is. --- llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index 010d6b0de9f6a..746926e5bee33 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -139,17 +139,16 @@ class SSAUpdaterImpl { for (unsigned p = 0; p != Info->NumPreds; ++p) { BlkT *Pred = Preds[p]; // Check if BBMap already has a BBInfo for the predecessor block. - typename BBMapTy::value_type &BBMapBucket = - BBMap.FindAndConstruct(Pred); - if (BBMapBucket.second) { - Info->Preds[p] = BBMapBucket.second; + BBInfo *&BBMapBucket = BBMap[Pred]; + if (BBMapBucket) { + Info->Preds[p] = BBMapBucket; continue; } // Create a new BBInfo for the predecessor. 
ValT PredVal = AvailableVals->lookup(Pred); BBInfo *PredInfo = new (Allocator) BBInfo(Pred, PredVal); - BBMapBucket.second = PredInfo; + BBMapBucket = PredInfo; Info->Preds[p] = PredInfo; if (PredInfo->AvailableVal) { From e99eb89d5d97efc709f18f9369f2ec087352baaa Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:29:13 -0700 Subject: [PATCH 044/425] [SimplifyCFG] Use range-based for loops (NFC) (#107180) --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 15de40c7b0996..f9db996cdc358 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1000,20 +1000,20 @@ bool SimplifyCFGOpt::simplifyEqualityComparisonWithOnlyPredecessor( // which value (or set of values) this is. ConstantInt *TIV = nullptr; BasicBlock *TIBB = TI->getParent(); - for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].Dest == TIBB) { + for (const auto &[Value, Dest] : PredCases) + if (Dest == TIBB) { if (TIV) return false; // Cannot handle multiple values coming to this block. - TIV = PredCases[i].Value; + TIV = Value; } assert(TIV && "No edge from pred to succ?"); // Okay, we found the one constant that our value can be if we get into TI's // BB. Find out which successor will unconditionally be branched to. BasicBlock *TheRealDest = nullptr; - for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) - if (ThisCases[i].Value == TIV) { - TheRealDest = ThisCases[i].Dest; + for (const auto &[Value, Dest] : ThisCases) + if (Value == TIV) { + TheRealDest = Dest; break; } From aacdc657fc255b2547bb37ee9bacde2df0452298 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 4 Sep 2024 16:34:27 +0800 Subject: [PATCH 045/425] [Clang] Preserve the ExpandsToEmpty flag in PackIndexingType (#107181) Similar to PackIndexingExpr, we should avoid another round of transformation of the pattern if the pattern has already turned out to be an empty pack. As an outcome, the empty SubstTemplateTypeParmPackType won't occur, and we don't need to collect any unexpanded packs. Fixes https://github.com/llvm/llvm-project/issues/105903 --- clang/docs/ReleaseNotes.rst | 2 +- clang/include/clang/AST/Type.h | 13 +++++++++---- clang/include/clang/AST/TypeProperties.td | 5 ++++- clang/lib/AST/ASTContext.cpp | 12 +++++++----- clang/lib/AST/Type.cpp | 7 ++++--- clang/lib/Sema/TreeTransform.h | 4 ++-- clang/test/SemaCXX/cxx2c-pack-indexing.cpp | 12 ++++++++++++ 7 files changed, 39 insertions(+), 16 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 511724c73015e..4128ca78ce396 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -345,7 +345,7 @@ Bug Fixes to C++ Support - Fixed a constraint comparison bug for friend declarations. (#GH78101) - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - +- Fixed a bug in the substitution of empty pack indexing types. 
(#GH105903) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 08f7638d7d8f9..853226118af40 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -5828,12 +5828,15 @@ class PackIndexingType final QualType Pattern; Expr *IndexExpr; - unsigned Size; + unsigned Size : 31; + + LLVM_PREFERRED_TYPE(bool) + unsigned ExpandsToEmptyPack : 1; protected: friend class ASTContext; // ASTContext creates these. PackIndexingType(const ASTContext &Context, QualType Canonical, - QualType Pattern, Expr *IndexExpr, + QualType Pattern, Expr *IndexExpr, bool ExpandsToEmptyPack, ArrayRef Expansions = {}); public: @@ -5857,6 +5860,8 @@ class PackIndexingType final bool hasSelectedType() const { return getSelectedIndex() != std::nullopt; } + bool expandsToEmptyPack() const { return ExpandsToEmptyPack; } + ArrayRef getExpansions() const { return {getExpansionsPtr(), Size}; } @@ -5869,10 +5874,10 @@ class PackIndexingType final if (hasSelectedType()) getSelectedType().Profile(ID); else - Profile(ID, Context, getPattern(), getIndexExpr()); + Profile(ID, Context, getPattern(), getIndexExpr(), expandsToEmptyPack()); } static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, - QualType Pattern, Expr *E); + QualType Pattern, Expr *E, bool ExpandsToEmptyPack); private: const QualType *getExpansionsPtr() const { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3df19315fd573..539a344cb0b69 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -473,9 +473,12 @@ let Class = PackIndexingType in { def : Property<"indexExpression", ExprRef> { let Read = [{ node->getIndexExpr() }]; } + def : Property<"expandsToEmptyPack", Bool> { + let Read = [{ node->expandsToEmptyPack() }]; + } def : Creator<[{ - return ctx.getPackIndexingType(pattern, indexExpression); + return ctx.getPackIndexingType(pattern, indexExpression, expandsToEmptyPack); }]>; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index c61234aa4d1af..341ea98a1b149 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6188,11 +6188,13 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, ArrayRef Expansions, int Index) const { QualType Canonical; + bool ExpandsToEmptyPack = FullySubstituted && Expansions.empty(); if (FullySubstituted && Index != -1) { Canonical = getCanonicalType(Expansions[Index]); } else { llvm::FoldingSetNodeID ID; - PackIndexingType::Profile(ID, *this, Pattern, IndexExpr); + PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, + ExpandsToEmptyPack); void *InsertPos = nullptr; PackIndexingType *Canon = DependentPackIndexingTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -6200,8 +6202,8 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, void *Mem = Allocate( PackIndexingType::totalSizeToAlloc(Expansions.size()), TypeAlignment); - Canon = new (Mem) - PackIndexingType(*this, QualType(), Pattern, IndexExpr, Expansions); + Canon = new (Mem) PackIndexingType(*this, QualType(), Pattern, IndexExpr, + ExpandsToEmptyPack, Expansions); DependentPackIndexingTypes.InsertNode(Canon, InsertPos); } Canonical = QualType(Canon, 0); @@ -6210,8 +6212,8 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, void *Mem = Allocate(PackIndexingType::totalSizeToAlloc(Expansions.size()), TypeAlignment); - auto *T = 
new (Mem) - PackIndexingType(*this, Canonical, Pattern, IndexExpr, Expansions); + auto *T = new (Mem) PackIndexingType(*this, Canonical, Pattern, IndexExpr, + ExpandsToEmptyPack, Expansions); Types.push_back(T); return QualType(T, 0); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index e89ce2e4b3844..b976d1a0ee60a 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3992,12 +3992,12 @@ void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID, PackIndexingType::PackIndexingType(const ASTContext &Context, QualType Canonical, QualType Pattern, - Expr *IndexExpr, + Expr *IndexExpr, bool ExpandsToEmptyPack, ArrayRef Expansions) : Type(PackIndexing, Canonical, computeDependence(Pattern, IndexExpr, Expansions)), Context(Context), Pattern(Pattern), IndexExpr(IndexExpr), - Size(Expansions.size()) { + Size(Expansions.size()), ExpandsToEmptyPack(ExpandsToEmptyPack) { std::uninitialized_copy(Expansions.begin(), Expansions.end(), getTrailingObjects()); @@ -4042,9 +4042,10 @@ PackIndexingType::computeDependence(QualType Pattern, Expr *IndexExpr, void PackIndexingType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, QualType Pattern, - Expr *E) { + Expr *E, bool ExpandsToEmptyPack) { Pattern.Profile(ID); E->Profile(ID, Context, true); + ID.AddBoolean(ExpandsToEmptyPack); } UnaryTransformType::UnaryTransformType(QualType BaseType, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 66e3f27fed9de..27eac401c28f5 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6687,10 +6687,10 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, bool NotYetExpanded = Types.empty(); bool FullySubstituted = true; - if (Types.empty()) + if (Types.empty() && !PIT->expandsToEmptyPack()) Types = llvm::ArrayRef(&Pattern, 1); - for (const QualType &T : Types) { + for (QualType T : Types) { if (!T->containsUnexpandedParameterPack()) { QualType Transformed = getDerived().TransformType(T); if (Transformed.isNull()) diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 7d7e808746217..962dbb8137f28 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -258,4 +258,16 @@ void f() { vars<0>::x<0>(); } +} // namespace GH105900 + +namespace GH105903 { + +template struct temp { + template static auto x() -> opts... [s] {} // expected-note {{invalid index 0 for pack 'opts' of size 0}} +}; + +void f() { + temp<>::x<0>(); // expected-error {{no matching}} } + +} // namespace GH105903 From 331f8225f37b714a4df7ff3176b574b756f4d965 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Wed, 4 Sep 2024 11:35:18 +0300 Subject: [PATCH 046/425] [mlir][AsmParser] Expose `parseMinus()` (#106881) Found while working on parser for custom expression type for my dialect. Builtin `AffineExpr` uses low-level parser API which is not available for external users. --- mlir/include/mlir/IR/OpImplementation.h | 6 ++++++ mlir/lib/AsmParser/AsmParserImpl.h | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index ae412c7227f8e..e2472eea8a371 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -641,6 +641,12 @@ class AsmParser { /// Parse a '+' token if present. virtual ParseResult parseOptionalPlus() = 0; + /// Parse a '-' token. 
+ virtual ParseResult parseMinus() = 0; + + /// Parse a '-' token if present. + virtual ParseResult parseOptionalMinus() = 0; + /// Parse a '*' token. virtual ParseResult parseStar() = 0; diff --git a/mlir/lib/AsmParser/AsmParserImpl.h b/mlir/lib/AsmParser/AsmParserImpl.h index b12687833e3fd..04250f63dcd25 100644 --- a/mlir/lib/AsmParser/AsmParserImpl.h +++ b/mlir/lib/AsmParser/AsmParserImpl.h @@ -226,6 +226,16 @@ class AsmParserImpl : public BaseT { return success(parser.consumeIf(Token::plus)); } + /// Parses a '-' token. + ParseResult parseMinus() override { + return parser.parseToken(Token::minus, "expected '-'"); + } + + /// Parses a '-' token if present. + ParseResult parseOptionalMinus() override { + return success(parser.consumeIf(Token::minus)); + } + /// Parse a '|' token. ParseResult parseVerticalBar() override { return parser.parseToken(Token::vertical_bar, "expected '|'"); From 8133d47632f35df00933bfd3d3626b003206ede4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 01:35:47 -0700 Subject: [PATCH 047/425] [CGOpenMPRuntime] Use DenseMap::operator[] (NFC) (#107185) I'm planning to deprecate DenseMap::FindAndConstruct in favor of DenseMap::operator[]. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 29 +++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 23b977be81602..3d392d869ee39 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1327,25 +1327,24 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction( void CGOpenMPRuntime::setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - assert(!Elem.second.ServiceInsertPt && "Insert point is set already."); + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + assert(!Elem.ServiceInsertPt && "Insert point is set already."); llvm::Value *Undef = llvm::UndefValue::get(CGF.Int32Ty); if (AtCurrentPoint) { - Elem.second.ServiceInsertPt = new llvm::BitCastInst( - Undef, CGF.Int32Ty, "svcpt", CGF.Builder.GetInsertBlock()); + Elem.ServiceInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt", + CGF.Builder.GetInsertBlock()); } else { - Elem.second.ServiceInsertPt = - new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt"); - Elem.second.ServiceInsertPt->insertAfter(CGF.AllocaInsertPt); + Elem.ServiceInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "svcpt"); + Elem.ServiceInsertPt->insertAfter(CGF.AllocaInsertPt); } } void CGOpenMPRuntime::clearLocThreadIdInsertPt(CodeGenFunction &CGF) { - auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - if (Elem.second.ServiceInsertPt) { - llvm::Instruction *Ptr = Elem.second.ServiceInsertPt; - Elem.second.ServiceInsertPt = nullptr; + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + if (Elem.ServiceInsertPt) { + llvm::Instruction *Ptr = Elem.ServiceInsertPt; + Elem.ServiceInsertPt = nullptr; Ptr->eraseFromParent(); } } @@ -1441,18 +1440,18 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, // kmpc_global_thread_num(ident_t *loc). // Generate thread id value and cache this value for use across the // function. 
- auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn); - if (!Elem.second.ServiceInsertPt) + auto &Elem = OpenMPLocThreadIDMap[CGF.CurFn]; + if (!Elem.ServiceInsertPt) setLocThreadIdInsertPt(CGF); CGBuilderTy::InsertPointGuard IPG(CGF.Builder); - CGF.Builder.SetInsertPoint(Elem.second.ServiceInsertPt); + CGF.Builder.SetInsertPoint(Elem.ServiceInsertPt); auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, Loc); llvm::CallInst *Call = CGF.Builder.CreateCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL___kmpc_global_thread_num), emitUpdateLocation(CGF, Loc)); Call->setCallingConv(CGF.getRuntimeCC()); - Elem.second.ThreadID = Call; + Elem.ThreadID = Call; return Call; } From 12d678a8eb11821e20eab86445f0cc9b66c24990 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 4 Sep 2024 09:37:15 +0100 Subject: [PATCH 048/425] [AArch64] Add codegen tests for zext(deinterleave). NFC --- llvm/test/CodeGen/AArch64/zext-shuffle.ll | 561 ++++++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/zext-shuffle.ll diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll new file mode 100644 index 0000000000000..4ef8daf141715 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -0,0 +1,561 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -o - %s | FileCheck %s + +define <2 x i64> @v2i64_02(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_02: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_13(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_13: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_04812(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_15913(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_261014(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: v2i64_37: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s +; CHECK-NEXT: 
ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i32> %a, <4 x i32> %b, <2 x i32> + %d = zext <2 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + +define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { +; CHECK-LABEL: v2i64_i16_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> + %z1 = zext <4 x i16> %s1 to <4 x i64> + ret <4 x i64> %z1 +} + + +define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: // kill: def $q1 
killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: v4i32_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = zext <4 x i16> %c to <4 x i32> + ret <4 x i32> %d +} + + +define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i16_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + + +define <8 x i64> @zext_add(<32 x i16> %l) { +; CHECK-LABEL: zext_add: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: adrp x9, .LCPI22_3 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: adrp x9, .LCPI22_7 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_4 +; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7] +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4] +; CHECK-NEXT: adrp x8, .LCPI22_5 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5] +; CHECK-NEXT: adrp x8, .LCPI22_6 +; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b +; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6] +; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b +; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b +; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b +; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b +; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b +; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h +; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h +; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h +; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h +; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s +; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s +; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s +; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: ret + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = zext <8 x i16> %s1 to <8 x i64> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = zext <8 x i16> %s2 to <8 x i64> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = zext <8 x i16> %s3 to <8 x i64> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = zext <8 x i16> %s4 to <8 x 
i64> + %a = add <8 x i64> %z1, %z2 + %b = add <8 x i64> %z3, %z4 + %c = add <8 x i64> %a, %b + ret <8 x i64> %c +} + +define <8 x i64> @zext_load_add(ptr %p) { +; CHECK-LABEL: zext_load_add: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +; CHECK-NEXT: uaddl v4.4s, v0.4h, v1.4h +; CHECK-NEXT: uaddl v5.4s, v2.4h, v3.4h +; CHECK-NEXT: uaddl2 v6.4s, v0.8h, v1.8h +; CHECK-NEXT: uaddl2 v2.4s, v2.8h, v3.8h +; CHECK-NEXT: uaddl v0.2d, v4.2s, v5.2s +; CHECK-NEXT: uaddl2 v1.2d, v4.4s, v5.4s +; CHECK-NEXT: uaddl2 v3.2d, v6.4s, v2.4s +; CHECK-NEXT: uaddl v2.2d, v6.2s, v2.2s +; CHECK-NEXT: ret + %l = load <32 x i16>, ptr %p + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = zext <8 x i16> %s1 to <8 x i64> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = zext <8 x i16> %s2 to <8 x i64> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = zext <8 x i16> %s3 to <8 x i64> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = zext <8 x i16> %s4 to <8 x i64> + %a = add <8 x i64> %z1, %z2 + %b = add <8 x i64> %z3, %z4 + %c = add <8 x i64> %a, %b + ret <8 x i64> %c +} + +define <8 x double> @uitofp_fadd(<32 x i16> %l) { +; CHECK-LABEL: uitofp_fadd: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: adrp x9, .LCPI24_1 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: adrp x10, .LCPI24_6 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: adrp x9, .LCPI24_3 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_4 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b +; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b +; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3] +; CHECK-NEXT: adrp x9, .LCPI24_5 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4] +; CHECK-NEXT: adrp x8, .LCPI24_7 +; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5] +; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6] +; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7] +; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b +; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b +; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b +; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b +; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: ushll v2.2d, v5.2s, #0 +; CHECK-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ucvtf v18.2d, v20.2d +; CHECK-NEXT: ucvtf v19.2d, v21.2d +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: ushll v22.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, 
v1.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d +; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d +; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d +; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d +; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d +; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: ret + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = uitofp <8 x i16> %s1 to <8 x double> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = uitofp <8 x i16> %s2 to <8 x double> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = uitofp <8 x i16> %s3 to <8 x double> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = uitofp <8 x i16> %s4 to <8 x double> + %a = fadd <8 x double> %z1, %z2 + %b = fadd <8 x double> %z3, %z4 + %c = fadd <8 x double> %a, %b + ret <8 x double> %c +} + +define <8 x double> @uitofp_load_fadd(ptr %p) { +; CHECK-LABEL: uitofp_load_fadd: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0 +; CHECK-NEXT: ushll v17.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0 +; CHECK-NEXT: ushll v0.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0 +; CHECK-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v19.2d, v7.4s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v22.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, v18.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll v18.2d, v18.2s, #0 +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v19.2d, v19.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v7.2d +; CHECK-NEXT: fadd v6.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v2.2d, v2.2d, v19.2d +; CHECK-NEXT: fadd v3.2d, v17.2d, v22.2d +; CHECK-NEXT: fadd v5.2d, v16.2d, 
v18.2d +; CHECK-NEXT: fadd v7.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d +; CHECK-NEXT: fadd v0.2d, v4.2d, v3.2d +; CHECK-NEXT: fadd v3.2d, v1.2d, v16.2d +; CHECK-NEXT: fadd v1.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v2.2d, v6.2d, v5.2d +; CHECK-NEXT: ret + %l = load <32 x i16>, ptr %p + %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z1 = uitofp <8 x i16> %s1 to <8 x double> + %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z2 = uitofp <8 x i16> %s2 to <8 x double> + %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z3 = uitofp <8 x i16> %s3 to <8 x double> + %s4 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> + %z4 = uitofp <8 x i16> %s4 to <8 x double> + %a = fadd <8 x double> %z1, %z2 + %b = fadd <8 x double> %z3, %z4 + %c = fadd <8 x double> %a, %b + ret <8 x double> %c +} + From 030e4d0cdf4c43a6ec1ca301b5a358991fa2ac4f Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 4 Sep 2024 10:38:18 +0200 Subject: [PATCH 049/425] [Clang] Treat default template argument as constant expressions (#107073) We only check that a default argument is a converted constant expression when using the default argument. However, when parsing a default argument, we need to make sure to parse it as a constant expression such as not ODR-use variables. (otherwise, we would try to capture default template arguments of generic lambdas) Fixes #107048 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Parse/ParseTemplate.cpp | 2 +- clang/test/SemaCXX/cxx2a-template-lambdas.cpp | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4128ca78ce396..251eb4c1c4559 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -346,6 +346,7 @@ Bug Fixes to C++ Support - Fix handling of ``_`` as the name of a lambda's init capture variable. (#GH107024) - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - Fixed a bug in the substitution of empty pack indexing types. 
(#GH105903) +- Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 6ecfc15757f3d..de29652abbfd9 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -959,7 +959,7 @@ Parser::ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position) { ++CurTemplateDepthTracker; EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - DefaultArg = Actions.CorrectDelayedTyposInExpr(ParseInitializer()); + DefaultArg = Actions.ActOnConstantExpression(ParseInitializer()); if (DefaultArg.isInvalid()) SkipUntil(tok::comma, tok::greater, StopAtSemi | StopBeforeMatch); } diff --git a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp index fff524e77d3bf..00ba291fbd198 100644 --- a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp +++ b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp @@ -97,3 +97,37 @@ void foo() { } #endif + +#if __cplusplus >= 202002L +namespace { +struct S {}; +constexpr S gs; +void f() { + constexpr int x{}; + const int y{}; + auto b = []{}; + using A = decltype([]{}); + + int z; // expected-note {{'z' declared here}} + auto c = []{ + // expected-error@-1 {{no matching function for call to object of type}} \ + // expected-error@-1 {{variable 'z' cannot be implicitly captured in a lambda with no capture-default specified}} \ + // expected-note@-1 {{lambda expression begins here}} \ + // expected-note@-1 4{{capture}} \ + // expected-note@-1 {{candidate template ignored: substitution failure: reference to local variable 'z' declared in enclosing function}} + return t; + }(); + + auto class_type_global = []{}; + + static constexpr S static_s; + auto class_type_static = []{}; + + constexpr S s; // expected-note {{'s' declared here}} + auto class_type = []{}; + // expected-error@-1 {{variable 's' cannot be implicitly captured in a lambda with no capture-default specified}} \ + // expected-note@-1 {{lambda expression begins here}} \ + // expected-note@-1 4{{capture}} +} +} +#endif From 50febdeb64fce345b0fb669e9688d34c8ffe7912 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 4 Sep 2024 16:41:56 +0800 Subject: [PATCH 050/425] [mlir][vector] Bugfix of linearize `vector.extract` (#106836) This patch add check for `vector.extract` with scalar type, which is not allowed when linearize `vector.extract`. Fix #106162. 
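
As an illustration (the op below is borrowed from the regression test added in this patch, so it is only a sketch of the failing pattern): an extract whose result is a scalar rather than a vector, e.g.

    %cst = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32>
    %0 = vector.extract %cst[0] : i32 from vector<4xi32>

The type converter yields no vector type for the scalar result `i32`, so the pattern now bails out with a match failure ("expected n-D vector type.") instead of attempting to linearize the op.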
--- .../Dialect/Vector/Transforms/VectorLinearize.cpp | 4 ++++ mlir/test/Dialect/Vector/linearize.mlir | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 868397f2daaae..11917ac1e4022 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -337,6 +337,10 @@ struct LinearizeVectorExtract final matchAndRewrite(vector::ExtractOp extractOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type dstTy = getTypeConverter()->convertType(extractOp.getType()); + if (!dstTy) + return rewriter.notifyMatchFailure(extractOp, + "expected n-D vector type."); + if (extractOp.getVector().getType().isScalable() || cast(dstTy).isScalable()) return rewriter.notifyMatchFailure(extractOp, diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir index 916e3e5fd2529..543e76b5b26e0 100644 --- a/mlir/test/Dialect/Vector/linearize.mlir +++ b/mlir/test/Dialect/Vector/linearize.mlir @@ -306,3 +306,15 @@ func.func @test_vector_insert_scalable(%arg0: vector<2x8x[4]xf32>, %arg1: vector // ALL: return %[[RES]] : vector<2x8x[4]xf32> return %0 : vector<2x8x[4]xf32> } + +// ----- + +// ALL-LABEL: test_vector_extract_scalar +func.func @test_vector_extract_scalar() { + %cst = arith.constant dense<[1, 2, 3, 4]> : vector<4xi32> + // ALL-NOT: vector.shuffle + // ALL: vector.extract + // ALL-NOT: vector.shuffle + %0 = vector.extract %cst[0] : i32 from vector<4xi32> + return +} From 58f289612f1959256fa2228f013cfe96304b45c4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 02:09:21 -0700 Subject: [PATCH 051/425] [XRay] Use DenseMap::{operator[],try_emplace} (NFC) (#107178) I'm planning to deprecate DenseMap::FindAndConstruct in favor of operator[]. I'm using try_emplace because "Vertices[I.first];" on its own might look like a nop statement. --- llvm/include/llvm/XRay/Graph.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h index 953ac1aa69637..07b418b8cb640 100644 --- a/llvm/include/llvm/XRay/Graph.h +++ b/llvm/include/llvm/XRay/Graph.h @@ -378,20 +378,17 @@ class Graph { /// Looks up the vertex with identifier I, if it does not exist it default /// constructs it. - VertexAttribute &operator[](const VertexIdentifier &I) { - return Vertices.FindAndConstruct(I).second; - } + VertexAttribute &operator[](const VertexIdentifier &I) { return Vertices[I]; } /// Looks up the edge with identifier I, if it does not exist it default /// constructs it, if it's endpoints do not exist it also default constructs /// them. EdgeAttribute &operator[](const EdgeIdentifier &I) { - auto &P = Edges.FindAndConstruct(I); - Vertices.FindAndConstruct(I.first); - Vertices.FindAndConstruct(I.second); + Vertices.try_emplace(I.first); + Vertices.try_emplace(I.second); InNeighbors[I.second].insert(I.first); OutNeighbors[I.first].insert(I.second); - return P.second; + return Edges[I]; } /// Looks up a vertex with Identifier I, or an error if it does not exist. 
@@ -479,8 +476,8 @@ class Graph { auto EI = Val.first; const auto &p = Edges.insert(std::move(Val)); if (p.second) { - Vertices.FindAndConstruct(EI.first); - Vertices.FindAndConstruct(EI.second); + Vertices.try_emplace(EI.first); + Vertices.try_emplace(EI.second); InNeighbors[EI.second].insert(EI.first); OutNeighbors[EI.first].insert(EI.second); }; From 9e08db796b7fc1aa21ec0a0c16a0213229e02010 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 4 Sep 2024 10:16:14 +0100 Subject: [PATCH 052/425] [OpenMPIRBuilder] Don't drop debug info for target region. (#80692) When an outlined function is generated for omp target region, a corresponding DISubprogram was not being generated. This resulted in all the debug information for the target region being dropped. This commit adds DISubprogram for the outlined function if there is one available for the parent function. It also updates the current debug location so that the right scope is used for the entries in the outlined function. There are places in the OpenMPIRBuilder which changes insertion point but don't update the debug location accordingly. They cause issue when debug info is enabled. I have fixed a few that I observed to cause issue. But there may be more and a systematic cleanup may be required. With this change in place, I can set source line breakpoint in target region and run to them in debugger. --- .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 6 ++- .../Integration/debug-target-region-vars.f90 | 28 ++++++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 43 ++++++++++++++++--- .../Frontend/OpenMPIRBuilderTest.cpp | 4 ++ mlir/test/Target/LLVMIR/omptarget-debug.mlir | 29 +++++++++++++ mlir/test/Target/LLVMIR/omptarget-debug2.mlir | 31 +++++++++++++ 6 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 flang/test/Integration/debug-target-region-vars.f90 create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug.mlir create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug2.mlir diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 46fc40b714aac..576e65ba6ecc5 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -309,7 +309,11 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, return; funcOp.walk([&](fir::cg::XDeclareOp declOp) { - handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); + // FIXME: We currently dont handle variables that are not in the entry + // blocks of the fuctions. These may be variable or arguments used in the + // OpenMP target regions. + if (&funcOp.front() == declOp->getBlock()) + handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); }); } diff --git a/flang/test/Integration/debug-target-region-vars.f90 b/flang/test/Integration/debug-target-region-vars.f90 new file mode 100644 index 0000000000000..a57afb301d9b7 --- /dev/null +++ b/flang/test/Integration/debug-target-region-vars.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +! Test that variables inside OpenMP target region don't cause build failure. 
+subroutine test1 + implicit none + real, allocatable :: xyz(:) + integer :: i + + !$omp target simd map(from:xyz) + do i = 1, size(xyz) + xyz(i) = 5.0 * xyz(i) + end do +end subroutine + +subroutine test2 (xyz) + integer :: i + integer :: xyz(:) + + !$omp target map(from:xyz) + !$omp do private(xyz) + do i = 1, 10 + xyz(i) = i + end do + !$omp end target +end subroutine + +!CHECK: DISubprogram(name: "test1"{{.*}}) +!CHECK: DISubprogram(name: "test2"{{.*}}) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 532313a31fc13..027b927fa6424 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -4328,6 +4329,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, // That's why make an unconditional branch from loop preheader to loop // exit block Builder.restoreIP({Preheader, Preheader->end()}); + Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc()); Preheader->getTerminator()->eraseFromParent(); Builder.CreateBr(CLI->getExit()); @@ -6584,13 +6586,45 @@ static Function *createOutlinedFunction( ParameterTypes.push_back(Arg->getType()); } + auto BB = Builder.GetInsertBlock(); + auto M = BB->getModule(); auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes, /*isVarArg*/ false); - auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, - Builder.GetInsertBlock()->getModule()); + auto Func = + Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M); // Save insert point. - auto OldInsertPoint = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); + // If there's a DISubprogram associated with current function, then + // generate one for the outlined function. + if (Function *ParentFunc = BB->getParent()) { + if (DISubprogram *SP = ParentFunc->getSubprogram()) { + DICompileUnit *CU = SP->getUnit(); + DIBuilder DB(*M, true, CU); + DebugLoc DL = Builder.getCurrentDebugLocation(); + if (DL) { + // TODO: We are using nullopt for arguments at the moment. This will + // need to be updated when debug data is being generated for variables. + DISubroutineType *Ty = + DB.createSubroutineType(DB.getOrCreateTypeArray(std::nullopt)); + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition | + DISubprogram::SPFlagOptimized | + DISubprogram::SPFlagLocalToUnit; + + DISubprogram *OutlinedSP = DB.createFunction( + CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty, + DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags); + + // Attach subprogram to the function. + Func->setSubprogram(OutlinedSP); + // Update the CurrentDebugLocation in the builder so that right scope + // is used for things inside outlined function. + Builder.SetCurrentDebugLocation( + DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(), + OutlinedSP, DL.getInlinedAt())); + } + } + } // Generate the region into the function. BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func); @@ -6697,9 +6731,6 @@ static Function *createOutlinedFunction( for (auto Deferred : DeferredReplacement) ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func); - // Restore insert point. 
- Builder.restoreIP(OldInsertPoint); - return Func; } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 6207792f9f0d0..c92a3ff2e7ba6 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6006,6 +6006,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { BasicBlock *FallbackBlock = Branch->getSuccessor(0); Iter = FallbackBlock->rbegin(); CallInst *FCall = dyn_cast(&*(++Iter)); + // 'F' has a dummy DISubprogram which causes OutlinedFunc to also + // have a DISubprogram. In this case, the call to OutlinedFunc needs + // to have a debug loc, otherwise verifier will complain. + FCall->setDebugLoc(DL); EXPECT_NE(FCall, nullptr); // Check that the correct aguments are passed in diff --git a/mlir/test/Target/LLVMIR/omptarget-debug.mlir b/mlir/test/Target/LLVMIR/omptarget-debug.mlir new file mode 100644 index 0000000000000..76a853249caca --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %9 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%9 -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %13 = llvm.mlir.constant(1 : i32) : i32 + llvm.store %13, %arg0 : i32, !llvm.ptr loc(#loc2) + omp.terminator + } + llvm.return + } loc(#loc3) +} +#file = #llvm.di_file<"target.f90" in ""> +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = LineTablesOnly> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram, compileUnit = #cu, scope = #file, + name = "_QQmain", file = #file, subprogramFlags = "Definition", type = #sp_ty> +#loc1 = loc("target.f90":1:1) +#loc2 = loc("target.f90":46:3) +#loc3 = loc(fused<#sp>[#loc1]) + +// CHECK-DAG: ![[SP:.*]] = {{.*}}!DISubprogram(name: "__omp_offloading_{{.*}}"{{.*}}) +// CHECK-DAG: !DILocation(line: 46, column: 3, scope: ![[SP]]) diff --git a/mlir/test/Target/LLVMIR/omptarget-debug2.mlir b/mlir/test/Target/LLVMIR/omptarget-debug2.mlir new file mode 100644 index 0000000000000..ee19cc31e5c6b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug2.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Same test as omptarget-debug.mlir but with is_target_device = false. +// Somehow test with omp.target don't work with -split-input-file. 
+module attributes {omp.is_target_device = false} { + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + %9 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%9 -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %13 = llvm.mlir.constant(1 : i32) : i32 + llvm.store %13, %arg0 : i32, !llvm.ptr loc(#loc2) + omp.terminator + } + llvm.return + } loc(#loc3) +} +#file = #llvm.di_file<"target.f90" in ""> +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = LineTablesOnly> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram, compileUnit = #cu, scope = #file, + name = "_QQmain", file = #file, subprogramFlags = "Definition", type = #sp_ty> +#loc1 = loc("target.f90":1:1) +#loc2 = loc("target.f90":46:3) +#loc3 = loc(fused<#sp>[#loc1]) + +// CHECK-DAG: ![[SP:.*]] = {{.*}}!DISubprogram(name: "__omp_offloading_{{.*}}"{{.*}}) +// CHECK-DAG: !DILocation(line: 46, column: 3, scope: ![[SP]]) From 5914566474de29309b0b4815ecd406805793de1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Wed, 4 Sep 2024 11:24:52 +0200 Subject: [PATCH 053/425] [Utils][SPIR-V] Adding spirv-sim to LLVM (#107094) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 2nd submission The buildbots are using python 3.8, and some type annotations I was using are only available starting 3.9. The last commit on the pile is the additional changes compared to the original submission https://github.com/llvm/llvm-project/pull/104020. ### Original text: Currently, the testing infrastructure for SPIR-V is based on FileCheck. Those tests are great to check some level of codegen, but when the test needs check both the CFG layout and the content of each basic-block, things becomes messy. Because the CHECK/CHECK-DAG/CHECK-NEXT state is limited, it is sometimes hard to catch the good block: if 2 basic blocks have similar instructions, FileCheck can match the wrong one. Cross-lane interaction can be a bit difficult to understand, and writting a FileCheck test that is strong enough to catch bad CFG transforms while not being broken everytime some unrelated codegen part changes is hard. And lastly, the spirv-val tooling we have checks that the generated SPIR-V respects the spec, not that it is correct in regards to the source IR. For those reasons, I believe the best way to test the structurizer is to: run spirv-val to make sure the CFG respects the spec. simulate the function to validate result for each lane, making sure the generated code is correct. This simulator has no other dependencies than core python. It also only supports a very limited set of instructions as we can test most features through control-flow and some basic cross-lane interactions. As-is, the added tests are just a harness for the simulator itself. If this gets merged, the structurizer PR will benefit from this as I'll be able to add extensive testing using this. 
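
To give a concrete picture of what these tests look like, here is the shape of one of the added tests, abridged from wave-get-lane-index.spv: the RUN line executes the `simple` function on a 4-lane wave and checks each lane's return value against the --expects list, and the function body simply returns the lane's SubgroupLocalInvocationId, so lane N is expected to produce N.

    ; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,3 -i %s
    ...
    %2 = OpLoad %uint %WaveIndex
    %3 = OpBitcast %int %2
         OpReturnValue %3

The negative tests (simple-bad-result.spv, simulator-args.spv) still use FileCheck, but only to verify the simulator's own error reporting.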
--------- Signed-off-by: Nathan Gauër --- llvm/test/Other/spirv-sim/branch.spv | 42 ++ llvm/test/Other/spirv-sim/call.spv | 36 + llvm/test/Other/spirv-sim/constant.spv | 36 + llvm/test/Other/spirv-sim/lit.local.cfg | 8 + llvm/test/Other/spirv-sim/loop.spv | 58 ++ .../Other/spirv-sim/simple-bad-result.spv | 26 + llvm/test/Other/spirv-sim/simple.spv | 22 + llvm/test/Other/spirv-sim/simulator-args.spv | 36 + llvm/test/Other/spirv-sim/switch.spv | 42 ++ .../Other/spirv-sim/wave-get-lane-index.spv | 30 + .../Other/spirv-sim/wave-read-lane-first.spv | 83 +++ llvm/test/lit.cfg.py | 2 +- llvm/utils/spirv-sim/instructions.py | 381 ++++++++++ llvm/utils/spirv-sim/spirv-sim.py | 658 ++++++++++++++++++ 14 files changed, 1459 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Other/spirv-sim/branch.spv create mode 100644 llvm/test/Other/spirv-sim/call.spv create mode 100644 llvm/test/Other/spirv-sim/constant.spv create mode 100644 llvm/test/Other/spirv-sim/lit.local.cfg create mode 100644 llvm/test/Other/spirv-sim/loop.spv create mode 100644 llvm/test/Other/spirv-sim/simple-bad-result.spv create mode 100644 llvm/test/Other/spirv-sim/simple.spv create mode 100644 llvm/test/Other/spirv-sim/simulator-args.spv create mode 100644 llvm/test/Other/spirv-sim/switch.spv create mode 100644 llvm/test/Other/spirv-sim/wave-get-lane-index.spv create mode 100644 llvm/test/Other/spirv-sim/wave-read-lane-first.spv create mode 100644 llvm/utils/spirv-sim/instructions.py create mode 100755 llvm/utils/spirv-sim/spirv-sim.py diff --git a/llvm/test/Other/spirv-sim/branch.spv b/llvm/test/Other/spirv-sim/branch.spv new file mode 100644 index 0000000000000..7ee0ebcad249d --- /dev/null +++ b/llvm/test/Other/spirv-sim/branch.spv @@ -0,0 +1,42 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=3 --expects=5,6,6 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_2 = OpConstant %int 2 + %int_5 = OpConstant %int 5 + %int_6 = OpConstant %int 6 + %uint_0 = OpConstant %uint 0 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + %3 = OpIEqual %bool %uint_0 %2 + OpSelectionMerge %merge None + OpBranchConditional %3 %true %false + %true = OpLabel + OpBranch %merge + %false = OpLabel + OpBranch %merge + %merge = OpLabel + %4 = OpPhi %int %int_5 %true %int_6 %false + OpReturnValue %4 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/call.spv b/llvm/test/Other/spirv-sim/call.spv new file mode 100644 index 0000000000000..320b048f95296 --- /dev/null +++ b/llvm/test/Other/spirv-sim/call.spv @@ -0,0 +1,36 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=1 --expects=2 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + 
OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %uint_2 = OpConstant %uint 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %sub_type = OpTypeFunction %uint + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %sub = OpFunction %uint None %sub_type + %a = OpLabel + OpReturnValue %uint_2 + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpFunctionCall %uint %sub + %3 = OpBitcast %int %2 + OpReturnValue %3 + OpFunctionEnd + + diff --git a/llvm/test/Other/spirv-sim/constant.spv b/llvm/test/Other/spirv-sim/constant.spv new file mode 100644 index 0000000000000..1002427943a8d --- /dev/null +++ b/llvm/test/Other/spirv-sim/constant.spv @@ -0,0 +1,36 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=a --wave=1 --expects=2 -i %s +; RUN: spirv-sim --function=b --wave=1 --expects=1 -i %s + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %a "a" + OpName %b "b" + OpName %main "main" + %int = OpTypeInt 32 1 + %s1 = OpTypeStruct %int %int %int + %s2 = OpTypeStruct %s1 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %s1_1_2 = OpConstantComposite %s1 %int_1 %int_2 %int_1 + %s2_s1 = OpConstantComposite %s2 %s1_1_2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %a = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpCompositeExtract %int %s1_1_2 1 + OpReturnValue %2 + OpFunctionEnd + %b = OpFunction %int None %simple_type + %3 = OpLabel + %4 = OpCompositeExtract %int %s2_s1 0 2 + OpReturnValue %4 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/lit.local.cfg b/llvm/test/Other/spirv-sim/lit.local.cfg new file mode 100644 index 0000000000000..67a8d9196f588 --- /dev/null +++ b/llvm/test/Other/spirv-sim/lit.local.cfg @@ -0,0 +1,8 @@ +spirv_sim_root = os.path.join(config.llvm_src_root, "utils", "spirv-sim") +config.substitutions.append( + ( + "spirv-sim", + "'%s' %s" + % (config.python_executable, os.path.join(spirv_sim_root, "spirv-sim.py")), + ) +) diff --git a/llvm/test/Other/spirv-sim/loop.spv b/llvm/test/Other/spirv-sim/loop.spv new file mode 100644 index 0000000000000..4fd0f1a7c96a3 --- /dev/null +++ b/llvm/test/Other/spirv-sim/loop.spv @@ -0,0 +1,58 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,2,2,4 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_2 = OpConstant %int 2 + %int_5 = OpConstant %int 5 + %int_6 = OpConstant %int 6 + %uint_0 = OpConstant %uint 0 + %uint_2 = OpConstant %uint 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %uint_fptr = OpTypePointer Function %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = 
OpFunction %void None %main_type + %unused = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %entry = OpLabel +; uint i = 0; + %i = OpVariable %uint_fptr Function + %1 = OpLoad %uint %WaveIndex + OpStore %i %uint_0 + OpBranch %header + %header = OpLabel + %2 = OpLoad %uint %i + %3 = OpULessThan %bool %2 %1 + OpLoopMerge %merge %continue None + OpBranchConditional %3 %body %merge +; while (i < WaveGetLaneIndex()) { +; i += 2; +; } + %body = OpLabel + OpBranch %continue + %continue = OpLabel + %4 = OpIAdd %uint %2 %uint_2 + OpStore %i %4 + OpBranch %header + %merge = OpLabel +; return (int) i; + %5 = OpLoad %uint %i + %6 = OpBitcast %int %5 + OpReturnValue %6 + OpFunctionEnd + + diff --git a/llvm/test/Other/spirv-sim/simple-bad-result.spv b/llvm/test/Other/spirv-sim/simple-bad-result.spv new file mode 100644 index 0000000000000..f4dd046cc078b --- /dev/null +++ b/llvm/test/Other/spirv-sim/simple-bad-result.spv @@ -0,0 +1,26 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: not spirv-sim --function=simple --wave=1 --expects=1 -i %s 2>&1 | FileCheck %s + +; CHECK: Expected != Observed +; CHECK: [1] != [2] + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/simple.spv b/llvm/test/Other/spirv-sim/simple.spv new file mode 100644 index 0000000000000..8c06192ea6e3d --- /dev/null +++ b/llvm/test/Other/spirv-sim/simple.spv @@ -0,0 +1,22 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=1 --expects=2 -i %s + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/simulator-args.spv b/llvm/test/Other/spirv-sim/simulator-args.spv new file mode 100644 index 0000000000000..d8b1018064158 --- /dev/null +++ b/llvm/test/Other/spirv-sim/simulator-args.spv @@ -0,0 +1,36 @@ +; RUN: not spirv-sim --function=simple --wave=a --expects=2 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-WAVE +; RUN: not spirv-sim --function=simple --wave=1 --expects=a -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-EXPECT +; RUN: not spirv-sim --function=simple --wave=1 --expects=1, -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-EXPECT +; RUN: not spirv-sim --function=simple --wave=2 --expects=1 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-SIZE +; RUN: not spirv-sim --function=foo --wave=1 --expects=1 -i %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NAME + +; CHECK-WAVE: Invalid format for --wave/-w flag. + +; CHECK-EXPECT: Invalid format for --expects/-e flag. 
+ +; CHECK-SIZE: Wave size != expected result array size + +; CHECK-NAME: 'foo' function not found. Known functions are: +; CHECK-NAME-NEXT: - main +; CHECK-NAME-NEXT: - simple +; CHECK-NANE-NOT-NEXT: - + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + %int = OpTypeInt 32 1 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + OpReturnValue %int_2 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/switch.spv b/llvm/test/Other/spirv-sim/switch.spv new file mode 100644 index 0000000000000..83dc56cecef2a --- /dev/null +++ b/llvm/test/Other/spirv-sim/switch.spv @@ -0,0 +1,42 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,0 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_0 = OpConstant %int 0 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %uint_0 = OpConstant %uint 0 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + OpSelectionMerge %merge None + OpSwitch %2 %default 1 %case_1 2 %case_2 + %default = OpLabel + OpBranch %merge + %case_1 = OpLabel + OpBranch %merge + %case_2 = OpLabel + OpBranch %merge + %merge = OpLabel + %4 = OpPhi %int %int_0 %default %int_1 %case_1 %int_2 %case_2 + OpReturnValue %4 + OpFunctionEnd diff --git a/llvm/test/Other/spirv-sim/wave-get-lane-index.spv b/llvm/test/Other/spirv-sim/wave-get-lane-index.spv new file mode 100644 index 0000000000000..1c1e5e8aefd4f --- /dev/null +++ b/llvm/test/Other/spirv-sim/wave-get-lane-index.spv @@ -0,0 +1,30 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,3 -i %s + OpCapability Shader + OpCapability GroupNonUniform + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %int_2 = OpConstant %int 2 + %void = OpTypeVoid + %main_type = OpTypeFunction %void +%simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %1 = OpLabel + %2 = OpLoad %uint %WaveIndex + %3 = OpBitcast %int %2 + OpReturnValue %3 + OpFunctionEnd + diff --git a/llvm/test/Other/spirv-sim/wave-read-lane-first.spv b/llvm/test/Other/spirv-sim/wave-read-lane-first.spv new file 
mode 100644 index 0000000000000..801fb55fbaa9f --- /dev/null +++ b/llvm/test/Other/spirv-sim/wave-read-lane-first.spv @@ -0,0 +1,83 @@ +; RUN: %if spirv-tools %{ spirv-as %s -o - | spirv-val - %} +; RUN: spirv-sim --function=simple --wave=4 --expects=0,1,2,0 -i %s + +; int simple() { +; int m[4] = { 0, 1, 2, 0 }; +; int idx = WaveGetLaneIndex(); +; for (int i = 0; i < 4; i++) { +; if (i == m[idx]) { +; return WaveReadLaneFirst(idx); +; } +; } +; return 0; +; } + OpCapability Shader + OpCapability GroupNonUniform + OpCapability GroupNonUniformBallot + OpMemoryModel Logical GLSL450 + OpEntryPoint GLCompute %main "main" %WaveIndex + OpExecutionMode %main LocalSize 1 1 1 + OpSource HLSL 670 + OpName %simple "simple" + OpName %main "main" + OpDecorate %WaveIndex BuiltIn SubgroupLocalInvocationId + %int = OpTypeInt 32 1 + %uint = OpTypeInt 32 0 + %bool = OpTypeBool + %int_0 = OpConstant %int 0 + %int_1 = OpConstant %int 1 + %int_2 = OpConstant %int 2 + %int_4 = OpConstant %int 4 + %uint_3 = OpConstant %uint 3 + %uint_4 = OpConstant %uint 4 + %void = OpTypeVoid + %main_type = OpTypeFunction %void + %simple_type = OpTypeFunction %int + %uint_iptr = OpTypePointer Input %uint + %int_fptr = OpTypePointer Function %int + %arr_int_uint_4 = OpTypeArray %int %uint_4 +%arr_int_uint_4_fptr = OpTypePointer Function %arr_int_uint_4 + %WaveIndex = OpVariable %uint_iptr Input + %main = OpFunction %void None %main_type + %entry = OpLabel + OpReturn + OpFunctionEnd + %simple = OpFunction %int None %simple_type + %bb_entry_0 = OpLabel + %m = OpVariable %arr_int_uint_4_fptr Function + %idx = OpVariable %int_fptr Function + %i = OpVariable %int_fptr Function + %27 = OpCompositeConstruct %arr_int_uint_4 %int_0 %int_1 %int_2 %int_0 + OpStore %m %27 + %28 = OpLoad %uint %WaveIndex + %29 = OpBitcast %int %28 + OpStore %idx %29 + OpStore %i %int_0 + OpBranch %for_check + %for_check = OpLabel + %31 = OpLoad %int %i + %33 = OpSLessThan %bool %31 %int_4 + OpLoopMerge %for_merge %for_continue None + OpBranchConditional %33 %for_body %for_merge + %for_body = OpLabel + %37 = OpLoad %int %i + %38 = OpLoad %int %idx + %39 = OpAccessChain %int_fptr %m %38 + %40 = OpLoad %int %39 + %41 = OpIEqual %bool %37 %40 + OpSelectionMerge %if_merge None + OpBranchConditional %41 %if_true %if_merge + %if_true = OpLabel + %44 = OpLoad %int %idx + %45 = OpGroupNonUniformBroadcastFirst %int %uint_3 %44 + OpReturnValue %45 + %if_merge = OpLabel + OpBranch %for_continue + %for_continue = OpLabel + %47 = OpLoad %int %i + %48 = OpIAdd %int %47 %int_1 + OpStore %i %48 + OpBranch %for_check + %for_merge = OpLabel + OpReturnValue %int_0 + OpFunctionEnd diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index bee7aa3903a1f..1e0dd0a7df34f 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -22,7 +22,7 @@ # suffixes: A list of file extensions to treat as test files. This is overriden # by individual lit.local.cfg files in the test subdirectories. -config.suffixes = [".ll", ".c", ".test", ".txt", ".s", ".mir", ".yaml"] +config.suffixes = [".ll", ".c", ".test", ".txt", ".s", ".mir", ".yaml", ".spv"] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' # subdirectories contain auxiliary inputs for various tests in their parent diff --git a/llvm/utils/spirv-sim/instructions.py b/llvm/utils/spirv-sim/instructions.py new file mode 100644 index 0000000000000..5e64a480a2be6 --- /dev/null +++ b/llvm/utils/spirv-sim/instructions.py @@ -0,0 +1,381 @@ +from typing import Optional, List + + +# Base class for an instruction. 
To implement a basic instruction that doesn't +# impact the control-flow, create a new class inheriting from this. +class Instruction: + # Contains the name of the output register, if any. + _result: Optional[str] + # Contains the instruction opcode. + _opcode: str + # Contains all the instruction operands, except result and opcode. + _operands: List[str] + + def __init__(self, line: str): + self.line = line + tokens = line.split() + if len(tokens) > 1 and tokens[1] == "=": + self._result = tokens[0] + self._opcode = tokens[2] + self._operands = tokens[3:] if len(tokens) > 2 else [] + else: + self._result = None + self._opcode = tokens[0] + self._operands = tokens[1:] if len(tokens) > 1 else [] + + def __str__(self): + if self._result is None: + return f" {self._opcode} {self._operands}" + return f"{self._result:3} = {self._opcode} {self._operands}" + + # Returns the instruction opcode. + def opcode(self) -> str: + return self._opcode + + # Returns the instruction operands. + def operands(self) -> List[str]: + return self._operands + + # Returns the instruction output register. Calling this function is + # only allowed if has_output_register() is true. + def output_register(self) -> str: + assert self._result is not None + return self._result + + # Returns true if this function has an output register. False otherwise. + def has_output_register(self) -> bool: + return self._result is not None + + # This function is used to initialize state related to this instruction + # before module execution begins. For example, global Input variables + # can use this to store the lane ID into the register. + def static_execution(self, lane): + pass + + # This function is called everytime this instruction is executed by a + # tangle. This function should not be directly overriden, instead see + # _impl and _advance_ip. + def runtime_execution(self, module, lane): + self._impl(module, lane) + self._advance_ip(module, lane) + + # This function needs to be overriden if your instruction can be executed. + # It implements the logic of the instruction. + # 'Static' instructions like OpConstant should not override this since + # they are not supposed to be executed at runtime. + def _impl(self, module, lane): + raise RuntimeError(f"Unimplemented instruction {self}") + + # By default, IP is incremented to point to the next instruction. + # If the instruction modifies IP (like OpBranch), this must be overridden. + def _advance_ip(self, module, lane): + lane.set_ip(lane.ip() + 1) + + +# Those are parsed, but never executed. +class OpEntryPoint(Instruction): + pass + + +class OpFunction(Instruction): + pass + + +class OpFunctionEnd(Instruction): + pass + + +class OpLabel(Instruction): + pass + + +class OpVariable(Instruction): + pass + + +class OpName(Instruction): + def name(self) -> str: + return self._operands[1][1:-1] + + def decoratedRegister(self) -> str: + return self._operands[0] + + +# The only decoration we use if the BuiltIn one to initialize the values. 
+class OpDecorate(Instruction): + def static_execution(self, lane): + if self._operands[1] == "LinkageAttributes": + return + + assert ( + self._operands[1] == "BuiltIn" + and self._operands[2] == "SubgroupLocalInvocationId" + ) + lane.set_register(self._operands[0], lane.tid()) + + +# Constants +class OpConstant(Instruction): + def static_execution(self, lane): + lane.set_register(self._result, int(self._operands[1])) + + +class OpConstantTrue(OpConstant): + def static_execution(self, lane): + lane.set_register(self._result, True) + + +class OpConstantFalse(OpConstant): + def static_execution(self, lane): + lane.set_register(self._result, False) + + +class OpConstantComposite(OpConstant): + def static_execution(self, lane): + result = [] + for op in self._operands[1:]: + result.append(lane.get_register(op)) + lane.set_register(self._result, result) + + +# Control flow instructions +class OpFunctionCall(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + entry = module.get_function_entry(self._operands[1]) + lane.do_call(entry, self._result) + + +class OpReturn(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.do_return(None) + + +class OpReturnValue(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.do_return(lane.get_register(self._operands[0])) + + +class OpBranch(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + lane.set_ip(module.get_bb_entry(self._operands[0])) + pass + + +class OpBranchConditional(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + condition = lane.get_register(self._operands[0]) + if condition: + lane.set_ip(module.get_bb_entry(self._operands[1])) + else: + lane.set_ip(module.get_bb_entry(self._operands[2])) + + +class OpSwitch(Instruction): + def _impl(self, module, lane): + pass + + def _advance_ip(self, module, lane): + value = lane.get_register(self._operands[0]) + default_label = self._operands[1] + i = 2 + while i < len(self._operands): + imm = int(self._operands[i]) + label = self._operands[i + 1] + if value == imm: + lane.set_ip(module.get_bb_entry(label)) + return + i += 2 + lane.set_ip(module.get_bb_entry(default_label)) + + +class OpUnreachable(Instruction): + def _impl(self, module, lane): + raise RuntimeError("This instruction should never be executed.") + + +# Convergence instructions +class MergeInstruction(Instruction): + def merge_location(self): + return self._operands[0] + + def continue_location(self): + return None if len(self._operands) < 3 else self._operands[1] + + def _impl(self, module, lane): + lane.handle_convergence_header(self) + + +class OpLoopMerge(MergeInstruction): + pass + + +class OpSelectionMerge(MergeInstruction): + pass + + +# Other instructions +class OpBitcast(Instruction): + def _impl(self, module, lane): + # TODO: find out the type from the defining instruction. + # This can only work for DXC. + if self._operands[0] == "%int": + lane.set_register(self._result, int(lane.get_register(self._operands[1]))) + else: + raise RuntimeError("Unsupported OpBitcast operand") + + +class OpAccessChain(Instruction): + def _impl(self, module, lane): + # Python dynamic types allows me to simplify. As long as the SPIR-V + # is legal, this should be fine. 
+ # Note: SPIR-V structs are stored as tuples + value = lane.get_register(self._operands[1]) + for operand in self._operands[2:]: + value = value[lane.get_register(operand)] + lane.set_register(self._result, value) + + +class OpCompositeConstruct(Instruction): + def _impl(self, module, lane): + output = [] + for op in self._operands[1:]: + output.append(lane.get_register(op)) + lane.set_register(self._result, output) + + +class OpCompositeExtract(Instruction): + def _impl(self, module, lane): + value = lane.get_register(self._operands[1]) + output = value + for op in self._operands[2:]: + output = output[int(op)] + lane.set_register(self._result, output) + + +class OpStore(Instruction): + def _impl(self, module, lane): + lane.set_register(self._operands[0], lane.get_register(self._operands[1])) + + +class OpLoad(Instruction): + def _impl(self, module, lane): + lane.set_register(self._result, lane.get_register(self._operands[1])) + + +class OpIAdd(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS + RHS) + + +class OpISub(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS - RHS) + + +class OpIMul(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS * RHS) + + +class OpLogicalNot(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + lane.set_register(self._result, not LHS) + + +class _LessThan(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS < RHS) + + +class _GreaterThan(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS > RHS) + + +class OpSLessThan(_LessThan): + pass + + +class OpULessThan(_LessThan): + pass + + +class OpSGreaterThan(_GreaterThan): + pass + + +class OpUGreaterThan(_GreaterThan): + pass + + +class OpIEqual(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS == RHS) + + +class OpINotEqual(Instruction): + def _impl(self, module, lane): + LHS = lane.get_register(self._operands[1]) + RHS = lane.get_register(self._operands[2]) + lane.set_register(self._result, LHS != RHS) + + +class OpPhi(Instruction): + def _impl(self, module, lane): + previousBBName = lane.get_previous_bb_name() + i = 1 + while i < len(self._operands): + label = self._operands[i + 1] + if label == previousBBName: + lane.set_register(self._result, lane.get_register(self._operands[i])) + return + i += 2 + raise RuntimeError("previousBB not in the OpPhi _operands") + + +class OpSelect(Instruction): + def _impl(self, module, lane): + condition = lane.get_register(self._operands[1]) + value = lane.get_register(self._operands[2 if condition else 3]) + lane.set_register(self._result, value) + + +# Wave intrinsics +class OpGroupNonUniformBroadcastFirst(Instruction): + def _impl(self, module, lane): + assert lane.get_register(self._operands[1]) == 3 + if lane.is_first_active_lane(): + lane.broadcast_register(self._result, 
lane.get_register(self._operands[2])) + + +class OpGroupNonUniformElect(Instruction): + def _impl(self, module, lane): + lane.set_register(self._result, lane.is_first_active_lane()) diff --git a/llvm/utils/spirv-sim/spirv-sim.py b/llvm/utils/spirv-sim/spirv-sim.py new file mode 100755 index 0000000000000..428b0ca4eb796 --- /dev/null +++ b/llvm/utils/spirv-sim/spirv-sim.py @@ -0,0 +1,658 @@ +#!/usr/bin/env python3 + +from __future__ import annotations +from dataclasses import dataclass +from instructions import * +from typing import Any, Iterable, Callable, Optional, Tuple, List, Dict +import argparse +import fileinput +import inspect +import re +import sys + +RE_EXPECTS = re.compile(r"^([0-9]+,)*[0-9]+$") + + +# Parse the SPIR-V instructions. Some instructions are ignored because +# not required to simulate this module. +# Instructions are to be implemented in instructions.py +def parseInstruction(i): + IGNORED = set( + [ + "OpCapability", + "OpMemoryModel", + "OpExecutionMode", + "OpExtension", + "OpSource", + "OpTypeInt", + "OpTypeStruct", + "OpTypeFloat", + "OpTypeBool", + "OpTypeVoid", + "OpTypeFunction", + "OpTypePointer", + "OpTypeArray", + ] + ) + if i.opcode() in IGNORED: + return None + + try: + Type = getattr(sys.modules["instructions"], i.opcode()) + except AttributeError: + raise RuntimeError(f"Unsupported instruction {i}") + if not inspect.isclass(Type): + raise RuntimeError( + f"{i} instruction definition is not a class. Did you used 'def' instead of 'class'?" + ) + return Type(i.line) + + +# Split a list of instructions into pieces. Pieces are delimited by instructions of the type splitType. +# The delimiter is the first instruction of the next piece. +# This function returns no empty pieces: +# - if 2 subsequent delimiters will mean 2 pieces. One with only the first delimiter, and the second +# with the delimiter and following instructions. +# - if the first instruction is a delimiter, the first piece will begin with this delimiter. +def splitInstructions( + splitType: type, instructions: Iterable[Instruction] +) -> List[List[Instruction]]: + blocks: List[List[Instruction]] = [[]] + for instruction in instructions: + if isinstance(instruction, splitType) and len(blocks[-1]) > 0: + blocks.append([]) + blocks[-1].append(instruction) + return blocks + + +# Defines a BasicBlock in the simulator. +# Begins at an OpLabel, and ends with a control-flow instruction. +class BasicBlock: + def __init__(self, instructions) -> None: + assert isinstance(instructions[0], OpLabel) + # The name of the basic block, which is the register of the leading + # OpLabel. + self._name = instructions[0].output_register() + # The list of instructions belonging to this block. + self._instructions = instructions[1:] + + # Returns the name of this basic block. + def name(self): + return self._name + + # Returns the instruction at index in this basic block. + def __getitem__(self, index: int) -> Instruction: + return self._instructions[index] + + # Returns the number of instructions in this basic block, excluding the + # leading OpLabel. + def __len__(self): + return len(self._instructions) + + def dump(self): + print(f" {self._name}:") + for instruction in self._instructions: + print(f" {instruction}") + + +# Defines a Function in the simulator. +class Function: + def __init__(self, instructions) -> None: + assert isinstance(instructions[0], OpFunction) + # The name of the function (name of the register returned by OpFunction). 
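+        # (The pretty name attached by OpName, if any, is resolved separately
+        # through Module.getNameFromRegister.)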
+ self._name: str = instructions[0].output_register() + # The list of basic blocks that belongs to this function. + self._basic_blocks: List[BasicBlock] = [] + # The variables local to this function. + self._variables: List[OpVariable] = [ + x for x in instructions if isinstance(x, OpVariable) + ] + + assert isinstance(instructions[-1], OpFunctionEnd) + body = filter(lambda x: not isinstance(x, OpVariable), instructions[1:-1]) + for block in splitInstructions(OpLabel, body): + self._basic_blocks.append(BasicBlock(block)) + + # Returns the name of this function. + def name(self) -> str: + return self._name + + # Returns the basic block at index in this function. + def __getitem__(self, index: int) -> BasicBlock: + return self._basic_blocks[index] + + # Returns the index of the basic block with the given name if found, + # -1 otherwise. + def get_bb_index(self, name) -> int: + for i in range(len(self._basic_blocks)): + if self._basic_blocks[i].name() == name: + return i + return -1 + + def dump(self): + print(" Variables:") + for var in self._variables: + print(f" {var}") + print(" Blocks:") + for bb in self._basic_blocks: + bb.dump() + + +# Represents an instruction pointer in the simulator. +@dataclass +class InstructionPointer: + # The current function the IP points to. + function: Function + # The basic block index in function IP points to. + basic_block: int + # The instruction in basic_block IP points to. + instruction_index: int + + def __str__(self): + bb = self.function[self.basic_block] + i = bb[self.instruction_index] + return f"{bb.name()}:{self.instruction_index} in {self.function.name()} | {i}" + + def __hash__(self): + return hash((self.function.name(), self.basic_block, self.instruction_index)) + + # Returns the basic block IP points to. + def bb(self) -> BasicBlock: + return self.function[self.basic_block] + + # Returns the instruction IP points to. + def instruction(self): + return self.function[self.basic_block][self.instruction_index] + + # Increment IP by 1. This only works inside a basic-block boundary. + # Incrementing IP when at the boundary of a basic block will fail. + def __add__(self, value: int): + bb = self.function[self.basic_block] + assert len(bb) > self.instruction_index + value + return InstructionPointer( + self.function, self.basic_block, self.instruction_index + value + ) + + +# Defines a Lane in this simulator. +class Lane: + # The registers known by this lane. + _registers: Dict[str, Any] + # The current IP of this lane. + _ip: Optional[InstructionPointer] + # If this lane running. + _running: bool + # The wave this lane belongs to. + _wave: Wave + # The callstack of this lane. Each tuple represents 1 call. + # The first element is the IP the function will return to. + # The second element is the callback to call to store the return value + # into the correct register. + _callstack: List[Tuple[InstructionPointer, Callable[[Any], None]]] + + _previous_bb: Optional[BasicBlock] + _current_bb: Optional[BasicBlock] + + def __init__(self, wave: Wave, tid: int) -> None: + self._registers = dict() + self._ip = None + self._running = True + self._wave = wave + self._callstack = [] + + # The index of this lane in the wave. + self._tid = tid + # The last BB this lane was executing into. + self._previous_bb = None + # The current BB this lane is executing into. + self._current_bb = None + + # Returns the lane/thread ID of this lane in its wave. 
+ def tid(self) -> int: + return self._tid + + # Returns true is this lane if the first by index in the current active tangle. + def is_first_active_lane(self) -> bool: + return self._tid == self._wave.get_first_active_lane_index() + + # Broadcast value into the registers of all active lanes. + def broadcast_register(self, register: str, value: Any) -> None: + self._wave.broadcast_register(register, value) + + # Returns the IP this lane is currently at. + def ip(self) -> InstructionPointer: + assert self._ip is not None + return self._ip + + # Returns true if this lane is running, false otherwise. + # Running means not dead. An inactive lane is running. + def running(self) -> bool: + return self._running + + # Set the register at "name" to "value" in this lane. + def set_register(self, name: str, value: Any) -> None: + self._registers[name] = value + + # Get the value in register "name" in this lane. + # If allow_undef is true, fetching an unknown register won't fail. + def get_register(self, name: str, allow_undef: bool = False) -> Optional[Any]: + if allow_undef and name not in self._registers: + return None + return self._registers[name] + + def set_ip(self, ip: InstructionPointer) -> None: + if ip.bb() != self._current_bb: + self._previous_bb = self._current_bb + self._current_bb = ip.bb() + self._ip = ip + + def get_previous_bb_name(self): + return self._previous_bb.name() + + def handle_convergence_header(self, instruction): + self._wave.handle_convergence_header(self, instruction) + + def do_call(self, ip, output_register): + return_ip = None if self._ip is None else self._ip + 1 + self._callstack.append( + (return_ip, lambda value: self.set_register(output_register, value)) + ) + self.set_ip(ip) + + def do_return(self, value): + ip, callback = self._callstack[-1] + self._callstack.pop() + + callback(value) + if len(self._callstack) == 0: + self._running = False + else: + self.set_ip(ip) + + +# Represents the SPIR-V module in the simulator. +class Module: + _functions: Dict[str, Function] + _prolog: List[Instruction] + _globals: List[Instruction] + _name2reg: Dict[str, str] + _reg2name: Dict[str, str] + + def __init__(self, instructions) -> None: + chunks = splitInstructions(OpFunction, instructions) + + # The instructions located outside of all functions. + self._prolog = chunks[0] + # The functions in this module. + self._functions = {} + # Global variables in this module. + self._globals = [ + x + for x in instructions + if isinstance(x, OpVariable) or issubclass(type(x), OpConstant) + ] + + # Helper dictionaries to get real names of registers, or registers by names. + self._name2reg = {} + self._reg2name = {} + for instruction in instructions: + if isinstance(instruction, OpName): + name = instruction.name() + reg = instruction.decoratedRegister() + self._name2reg[name] = reg + self._reg2name[reg] = name + + for chunk in chunks[1:]: + function = Function(chunk) + assert function.name() not in self._functions + self._functions[function.name()] = function + + # Returns the register matching "name" if any, None otherwise. + # This assumes names are unique. + def getRegisterFromName(self, name): + if name in self._name2reg: + return self._name2reg[name] + return None + + # Returns the name given to "register" if any, None otherwise. + def getNameFromRegister(self, register): + if register in self._reg2name: + return self._reg2name[register] + return None + + # Initialize the module before wave execution begins. + # See Instruction::static_execution for more details. 
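+    # This executes the static part of global constants/variables and of the
+    # OpDecorate builtins (e.g. SubgroupLocalInvocationId) for the given lane.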
+ def initialize(self, lane): + for instruction in self._globals: + instruction.static_execution(lane) + + # Initialize builtins + for instruction in self._prolog: + if isinstance(instruction, OpDecorate): + instruction.static_execution(lane) + + def execute_one_instruction(self, lane: Lane, ip: InstructionPointer) -> None: + ip.instruction().runtime_execution(self, lane) + + # Returns the first valid IP for the function defined by the given register. + # Calling this with a register not returned by OpFunction is illegal. + def get_function_entry(self, register: str) -> InstructionPointer: + if register not in self._functions: + raise RuntimeError(f"Function defining {register} not found.") + return InstructionPointer(self._functions[register], 0, 0) + + # Returns the first valid IP for the basic block defined by register. + # Calling this with a register not returned by an OpLabel is illegal. + def get_bb_entry(self, register: str) -> InstructionPointer: + for name, function in self._functions.items(): + index = function.get_bb_index(register) + if index != -1: + return InstructionPointer(function, index, 0) + raise RuntimeError(f"Instruction defining {register} not found.") + + # Returns the list of function names in this module. + # If an OpName exists for this function, returns the pretty name, else + # returns the register name. + def get_function_names(self): + return [self.getNameFromRegister(reg) for reg, func in self._functions.items()] + + # Returns the global variables defined in this module. + def variables(self) -> Iterable: + return [x.output_register() for x in self._globals] + + def dump(self, function_name: Optional[str] = None): + print("Module:") + print(" globals:") + for instruction in self._globals: + print(f" {instruction}") + + if function_name is None: + print(" functions:") + for register, function in self._functions.items(): + name = self.getNameFromRegister(register) + print(f" Function {register} ({name})") + function.dump() + return + + register = self.getRegisterFromName(function_name) + print(f" function {register} ({function_name}):") + if register is not None: + self._functions[register].dump() + else: + print(f" error: cannot find function.") + + +# Defines a convergence requirement for the simulation: +# A list of lanes impacted by a merge and possibly the associated +# continue target. +@dataclass +class ConvergenceRequirement: + mergeTarget: InstructionPointer + continueTarget: Optional[InstructionPointer] + impactedLanes: set[int] + + +Task = Dict[InstructionPointer, List[Lane]] + + +# Defines a Lane group/Wave in the simulator. +class Wave: + # The module this wave will execute. + _module: Module + # The lanes this wave will be composed of. + _lanes: List[Lane] + # The instructions scheduled for execution. + _tasks: Task + # The actual requirements to comply with when executing instructions. + # E.g: the set of lanes required to merge before executing the merge block. + _convergence_requirements: List[ConvergenceRequirement] + # The indices of the active lanes for the current executing instruction. + _active_lane_indices: set[int] + + def __init__(self, module, wave_size: int) -> None: + assert wave_size > 0 + self._module = module + self._lanes = [] + + for i in range(wave_size): + self._lanes.append(Lane(self, i)) + + self._tasks = {} + self._convergence_requirements = [] + # The indices of the active lanes for the current executing instruction. 
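+        # Updated by run() before each step; queried by get_first_active_lane_index().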
+ self._active_lane_indices = set() + + # Returns True if the given IP can be executed for the given list of lanes. + def _is_task_candidate(self, ip: InstructionPointer, lanes: List[Lane]): + merged_lanes: set[int] = set() + for lane in self._lanes: + if not lane.running(): + merged_lanes.add(lane.tid()) + + for requirement in self._convergence_requirements: + # This task is not executing a merge or continue target. + # Adding all lanes at those points into the ignore list. + if requirement.mergeTarget != ip and requirement.continueTarget != ip: + for tid in requirement.impactedLanes: + if self._lanes[tid].ip() == requirement.mergeTarget: + merged_lanes.add(tid) + if self._lanes[tid].ip() == requirement.continueTarget: + merged_lanes.add(tid) + continue + + # This task is executing the current requirement continue/merge + # target. + for tid in requirement.impactedLanes: + lane = self._lanes[tid] + if not lane.running(): + continue + + if lane.tid() in merged_lanes: + continue + + if ip == requirement.mergeTarget: + if lane.ip() != requirement.mergeTarget: + return False + else: + if ( + lane.ip() != requirement.mergeTarget + and lane.ip() != requirement.continueTarget + ): + return False + return True + + # Returns the next task we can schedule. This must always return a task. + # Calling this when all lanes are dead is invalid. + def _get_next_runnable_task(self) -> Tuple[InstructionPointer, List[Lane]]: + candidate = None + for ip, lanes in self._tasks.items(): + if len(lanes) == 0: + continue + if self._is_task_candidate(ip, lanes): + candidate = ip + break + + if candidate: + lanes = self._tasks[candidate] + del self._tasks[ip] + return (candidate, lanes) + raise RuntimeError("No task to execute. Deadlock?") + + # Handle an encountered merge instruction for the given lane. + def handle_convergence_header(self, lane: Lane, instruction: MergeInstruction): + mergeTarget = self._module.get_bb_entry(instruction.merge_location()) + for requirement in self._convergence_requirements: + if requirement.mergeTarget == mergeTarget: + requirement.impactedLanes.add(lane.tid()) + return + + continueTarget = None + if instruction.continue_location(): + continueTarget = self._module.get_bb_entry(instruction.continue_location()) + requirement = ConvergenceRequirement( + mergeTarget, continueTarget, set([lane.tid()]) + ) + self._convergence_requirements.append(requirement) + + # Returns true if some instructions are scheduled for execution. + def _has_tasks(self) -> bool: + return len(self._tasks) > 0 + + # Returns the index of the first active lane right now. + def get_first_active_lane_index(self) -> int: + return min(self._active_lane_indices) + + # Broadcast the given value to all active lane registers. + def broadcast_register(self, register: str, value: Any) -> None: + for tid in self._active_lane_indices: + self._lanes[tid].set_register(register, value) + + # Returns the entrypoint of the function associated with 'name'. + # Calling this function with an invalid name is illegal. + def _get_function_entry_from_name(self, name: str) -> InstructionPointer: + register = self._module.getRegisterFromName(name) + assert register is not None + return self._module.get_function_entry(register) + + # Run the wave on the function 'function_name' until all lanes are dead. + # If verbose is True, execution trace is printed. + # Returns the value returned by the function for each lane. 
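+    # The returned list is ordered by lane id (index 0 is lane 0, and so on).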
+ def run(self, function_name: str, verbose: bool = False) -> List[Any]: + for t in self._lanes: + self._module.initialize(t) + + entry_ip = self._get_function_entry_from_name(function_name) + assert entry_ip is not None + for t in self._lanes: + t.do_call(entry_ip, "__shader_output__") + + self._tasks[self._lanes[0].ip()] = self._lanes + while self._has_tasks(): + ip, lanes = self._get_next_runnable_task() + self._active_lane_indices = set([x.tid() for x in lanes]) + if verbose: + print( + f"Executing with lanes {self._active_lane_indices}: {ip.instruction()}" + ) + + for lane in lanes: + self._module.execute_one_instruction(lane, ip) + if not lane.running(): + continue + + if lane.ip() in self._tasks: + self._tasks[lane.ip()].append(lane) + else: + self._tasks[lane.ip()] = [lane] + + if verbose and ip.instruction().has_output_register(): + register = ip.instruction().output_register() + print( + f" {register:3} = {[ x.get_register(register, allow_undef=True) for x in lanes ]}" + ) + + output = [] + for lane in self._lanes: + output.append(lane.get_register("__shader_output__")) + return output + + def dump_register(self, register: str) -> None: + for lane in self._lanes: + print( + f" Lane {lane.tid():2} | {register:3} = {lane.get_register(register)}" + ) + + +parser = argparse.ArgumentParser( + description="simulator", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument( + "-i", "--input", help="Text SPIR-V to read from", required=False, default="-" +) +parser.add_argument("-f", "--function", help="Function to execute") +parser.add_argument("-w", "--wave", help="Wave size", default=32, required=False) +parser.add_argument( + "-e", + "--expects", + help="Expected results per lanes, expects a list of values. Ex: '1, 2, 3'.", +) +parser.add_argument("-v", "--verbose", help="verbose", action="store_true") +args = parser.parse_args() + + +def load_instructions(filename: str): + if filename is None: + return [] + + if filename.strip() != "-": + try: + with open(filename, "r") as f: + lines = f.read().split("\n") + except Exception: # (FileNotFoundError, PermissionError): + return [] + else: + lines = sys.stdin.readlines() + + # Remove leading/trailing whitespaces. + lines = [x.strip() for x in lines] + # Strip comments. + lines = [x for x in filter(lambda x: len(x) != 0 and x[0] != ";", lines)] + + instructions = [] + for i in [Instruction(x) for x in lines]: + out = parseInstruction(i) + if out != None: + instructions.append(out) + return instructions + + +def main(): + if args.expects is None or not RE_EXPECTS.match(args.expects): + print("Invalid format for --expects/-e flag.", file=sys.stderr) + sys.exit(1) + if args.function is None: + print("Invalid format for --function/-f flag.", file=sys.stderr) + sys.exit(1) + try: + int(args.wave) + except ValueError: + print("Invalid format for --wave/-w flag.", file=sys.stderr) + sys.exit(1) + + expected_results = [int(x.strip()) for x in args.expects.split(",")] + wave_size = int(args.wave) + if len(expected_results) != wave_size: + print("Wave size != expected result array size", file=sys.stderr) + sys.exit(1) + + instructions = load_instructions(args.input) + if len(instructions) == 0: + print("Invalid input. Expected a text SPIR-V module.") + sys.exit(1) + + module = Module(instructions) + if args.verbose: + module.dump() + module.dump(args.function) + + function_names = module.get_function_names() + if args.function not in function_names: + print( + f"'{args.function}' function not found. 
Known functions are:",
+ file=sys.stderr,
+ )
+ for name in function_names:
+ print(f" - {name}", file=sys.stderr)
+ sys.exit(1)
+
+ wave = Wave(module, wave_size)
+ results = wave.run(args.function, verbose=args.verbose)
+
+ if expected_results != results:
+ print("Expected != Observed", file=sys.stderr)
+ print(f"{expected_results} != {results}", file=sys.stderr)
+ sys.exit(1)
+ sys.exit(0)
+
+
+main()

From afb6dafc6b680fb204d40c7fee4b339aa8471010 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?=
Date: Wed, 4 Sep 2024 11:27:03 +0200
Subject: [PATCH 054/425] [clang][HLSL] Add WaveIsFirstLane() intrinsic (#103299)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the WaveIsFirstLane() HLSL intrinsic. This intrinsic
uses the convergence intrinsics for the SPIR-V backend. On the DXIL
side, I'm not sure what the strategy is for convergence, so I
implemented it like in DXC: as a normal builtin function.

Signed-off-by: Nathan Gauër
---
 clang/include/clang/Basic/Builtins.td | 6 +++
 clang/lib/CodeGen/CGBuiltin.cpp | 4 ++
 clang/lib/CodeGen/CGHLSLRuntime.h | 1 +
 clang/lib/Headers/hlsl/hlsl_intrinsics.h | 4 ++
 .../builtins/wave_is_first_lane.hlsl | 34 ++++++++++++
 llvm/include/llvm/IR/IntrinsicsDirectX.td | 2 +
 llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 +
 llvm/lib/Target/DirectX/DXIL.td | 9 ++++
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +++
 .../SPIRV/SPIRVStripConvergentIntrinsics.cpp | 53 +++++++++++--------
 .../CodeGen/DirectX/wave_is_first_lane.ll | 13 +++++
 .../SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll | 27 ++++++++++
 12 files changed, 140 insertions(+), 22 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl
 create mode 100644 llvm/test/CodeGen/DirectX/wave_is_first_lane.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 8668b25661dec..9e2a590f265ac 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4679,6 +4679,12 @@ def HLSLWaveGetLaneIndex : LangBuiltin<"HLSL_LANG"> {
 let Prototype = "unsigned int()";
 }
+def HLSLWaveIsFirstLane : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_wave_is_first_lane"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "bool()";
+}
+
 def HLSLClamp : LangBuiltin<"HLSL_LANG"> {
 let Spellings = ["__builtin_hlsl_elementwise_clamp"];
 let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 786c2c224b349..012b43b8770be 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18723,6 +18723,10 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: {
 llvm::FunctionType::get(IntTy, {}, false), "__hlsl_wave_get_lane_index",
 {}, false, true));
 }
+ case Builtin::BI__builtin_hlsl_wave_is_first_lane: {
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic();
+ return EmitRuntimeCall(Intrinsic::getDeclaration(&CGM.getModule(), ID));
+ }
 }
 return nullptr;
 }
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 55a4b97c160cd..40dcd74b7dd24 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -84,6 +84,7 @@ class CGHLSLRuntime {
 GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
 GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
 GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 6d38b668fe770..5c08a45a35377 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -1796,5 +1796,9 @@ _HLSL_AVAILABILITY(shadermodel, 6.0) _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_get_lane_index) __attribute__((convergent)) uint WaveGetLaneIndex(); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_is_first_lane) +__attribute__((convergent)) bool WaveIsFirstLane(); + } // namespace hlsl #endif //_HLSL_HLSL_INTRINSICS_H_ diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl new file mode 100644 index 0000000000000..18860c321eb91 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL + +[numthreads(1, 1, 1)] +void main() { +// CHECK-SPIRV: %[[#entry_tok:]] = call token @llvm.experimental.convergence.entry() + +// CHECK-SPIRV: %[[#loop_tok:]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %[[#entry_tok]]) ] + while (true) { + +// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] + if (WaveIsFirstLane()) { + break; + } + } + +// CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] + if (WaveIsFirstLane()) { + return; + } +} + +// CHECK-DXIL: i1 @llvm.dx.wave.is.first.lane() #[[#attr:]] +// CHECK-SPIRV: i1 @llvm.spv.wave.is.first.lane() #[[#attr:]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 32af50b25f390..5f48e2ba939f5 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -79,4 +79,6 @@ def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLV def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>; def int_dx_rcp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + +def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 63d9ba43a1183..cbf6e04f2844d 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -79,4 +79,5 @@ let TargetPrefix = "spv" in { DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, 
Commutative] >;
+ def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
 }
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 83ea36ca048ad..4e3ecf4300d82 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -746,3 +746,12 @@ def CreateHandleFromBinding : DXILOp<218, createHandleFromBinding> {
 let result = HandleTy;
 let stages = [Stages];
 }
+
+def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> {
+ let Doc = "returns 1 for the first lane in the wave";
+ let LLVMIntrinsic = int_dx_wave_is_first_lane;
+ let arguments = [];
+ let result = Int1Ty;
+ let stages = [Stages];
+ let attributes = [Attributes];
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 9e10d947081cc..fed82b904af4f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -2351,6 +2351,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
 }
 break;
 case Intrinsic::spv_saturate:
 return selectSaturate(ResVReg, ResType, I);
+ case Intrinsic::spv_wave_is_first_lane: {
+ SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII);
+ return BuildMI(BB, I, I.getDebugLoc(),
+ TII.get(SPIRV::OpGroupNonUniformElect))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII));
+ }
 default: {
 std::string DiagMsg;
 raw_string_ostream OS(DiagMsg);
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index dca30535acfa1..b632d78497767 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -41,31 +41,40 @@ class SPIRVStripConvergentIntrinsics : public FunctionPass {
 virtual bool runOnFunction(Function &F) override {
 DenseSet<Instruction *> ToRemove;
+ // If the instruction is a convergent intrinsic, add it to the kill-list and
+ // return true. Return false otherwise.
+ auto CleanupIntrinsic = [&](IntrinsicInst *II) {
+ if (II->getIntrinsicID() != Intrinsic::experimental_convergence_entry &&
+ II->getIntrinsicID() != Intrinsic::experimental_convergence_loop &&
+ II->getIntrinsicID() != Intrinsic::experimental_convergence_anchor)
+ return false;
+
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ ToRemove.insert(II);
+ return true;
+ };
+
+ // Replace the given CallInst by a similar CallInst with no convergencectrl
+ // attribute.
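+ // The convergence token producers themselves are removed by CleanupIntrinsic
+ // above.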
+ auto CleanupCall = [&](CallInst *CI) { + auto OB = CI->getOperandBundle(LLVMContext::OB_convergencectrl); + if (!OB.has_value()) + return; + + auto *NewCall = CallBase::removeOperandBundle( + CI, LLVMContext::OB_convergencectrl, CI); + NewCall->copyMetadata(*CI); + CI->replaceAllUsesWith(NewCall); + ToRemove.insert(CI); + }; + for (BasicBlock &BB : F) { for (Instruction &I : BB) { - if (auto *II = dyn_cast(&I)) { - if (II->getIntrinsicID() != - Intrinsic::experimental_convergence_entry && - II->getIntrinsicID() != - Intrinsic::experimental_convergence_loop && - II->getIntrinsicID() != - Intrinsic::experimental_convergence_anchor) { + if (auto *II = dyn_cast(&I)) + if (CleanupIntrinsic(II)) continue; - } - - II->replaceAllUsesWith(UndefValue::get(II->getType())); - ToRemove.insert(II); - } else if (auto *CI = dyn_cast(&I)) { - auto OB = CI->getOperandBundle(LLVMContext::OB_convergencectrl); - if (!OB.has_value()) - continue; - - auto *NewCall = CallBase::removeOperandBundle( - CI, LLVMContext::OB_convergencectrl, CI); - NewCall->copyMetadata(*CI); - CI->replaceAllUsesWith(NewCall); - ToRemove.insert(CI); - } + if (auto *CI = dyn_cast(&I)) + CleanupCall(CI); } } diff --git a/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll b/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll new file mode 100644 index 0000000000000..2265dd8f7348c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wave_is_first_lane.ll @@ -0,0 +1,13 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define void @main() #0 { +entry: +; CHECK: call i1 @dx.op.waveIsFirstLane(i32 110) + %0 = call i1 @llvm.dx.wave.is.first.lane() + ret void +} + +declare i1 @llvm.dx.wave.is.first.lane() #1 + +attributes #0 = { convergent norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll new file mode 100644 index 0000000000000..94597b37cc7eb --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveIsFirstLane.ll @@ -0,0 +1,27 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv-unknown-vulkan-compute" + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#uint_3:]] = OpConstant %[[#uint]] 3 +; CHECK-DAG: %[[#bool:]] = OpTypeBool + +define spir_func void @main() #0 { +entry: + %0 = call token @llvm.experimental.convergence.entry() +; CHECK: %[[#]] = OpGroupNonUniformElect %[[#bool]] %[[#uint_3]] + %1 = call i1 @llvm.spv.wave.is.first.lane() [ "convergencectrl"(token %0) ] + ret void +} + +declare i32 @__hlsl_wave_get_lane_index() #1 + +attributes #0 = { convergent norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { convergent } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 4, !"dx.disable_optimizations", i32 1} From cc5c526c80a4cacf7ed5b7fbe50072594ec1aeaf Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 4 Sep 2024 11:30:58 +0200 Subject: [PATCH 055/425] [lldb] Fix and speedup the `memory find` command (#104193) This patch fixes an issue where 
the `memory find` command would effectively stop searching after encountering a memory read error (which could happen due to unreadable memory), without giving any indication that it has done so (it would just print it could not find the pattern). To make matters worse, it would not terminate after encountering this error, but rather proceed to slowly increment the address pointer, which meant that searching a large region could take a very long time (and give the appearance that lldb is actually searching for the thing). The patch fixes this first problem by detecting read errors and skipping over (using GetMemoryRegionInfo) the unreadable parts of memory and resuming the search after them. It also reads the memory in bulk (`max(sizeof(pattern))`), which speeds up the search significantly (up to 6x for live processes, 18x for core files). --- lldb/source/Target/Process.cpp | 69 ++++++++++--------- .../memory/holes/TestMemoryHoles.py | 13 ++++ 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index ae64f6f261bad..6c5c5162e2468 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -114,33 +114,6 @@ class ProcessOptionValueProperties } }; -class ProcessMemoryIterator { -public: - ProcessMemoryIterator(Process &process, lldb::addr_t base) - : m_process(process), m_base_addr(base) {} - - bool IsValid() { return m_is_valid; } - - uint8_t operator[](lldb::addr_t offset) { - if (!IsValid()) - return 0; - - uint8_t retval = 0; - Status error; - if (0 == m_process.ReadMemory(m_base_addr + offset, &retval, 1, error)) { - m_is_valid = false; - return 0; - } - - return retval; - } - -private: - Process &m_process; - const lldb::addr_t m_base_addr; - bool m_is_valid = true; -}; - static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = { { eFollowParent, @@ -3379,21 +3352,49 @@ lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high, if (region_size < size) return LLDB_INVALID_ADDRESS; + // See "Boyer-Moore string search algorithm". std::vector bad_char_heuristic(256, size); - ProcessMemoryIterator iterator(*this, low); - for (size_t idx = 0; idx < size - 1; idx++) { decltype(bad_char_heuristic)::size_type bcu_idx = buf[idx]; bad_char_heuristic[bcu_idx] = size - idx - 1; } - for (size_t s = 0; s <= (region_size - size);) { + + // Memory we're currently searching through. + llvm::SmallVector mem; + // Position of the memory buffer. + addr_t mem_pos = low; + // Maximum number of bytes read (and buffered). We need to read at least + // `size` bytes for a successful match. + const size_t max_read_size = std::max(size, 0x10000); + + for (addr_t cur_addr = low; cur_addr <= (high - size);) { + if (cur_addr + size > mem_pos + mem.size()) { + // We need to read more data. We don't attempt to reuse the data we've + // already read (up to `size-1` bytes from `cur_addr` to + // `mem_pos+mem.size()`). This is fine for patterns much smaller than + // max_read_size. For very + // long patterns we may need to do something more elaborate. + mem.resize_for_overwrite(max_read_size); + Status error; + mem.resize(ReadMemory(cur_addr, mem.data(), + std::min(mem.size(), high - cur_addr), error)); + mem_pos = cur_addr; + if (size > mem.size()) { + // We didn't read enough data. Skip to the next memory region. 
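+ // GetMemoryRegionInfo tells us where the current (unreadable) region ends,
+ // so we can resume the search right past it.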
+ MemoryRegionInfo info; + error = GetMemoryRegionInfo(mem_pos + mem.size(), info); + if (error.Fail()) + break; + cur_addr = info.GetRange().GetRangeEnd(); + continue; + } + } int64_t j = size - 1; - while (j >= 0 && buf[j] == iterator[s + j]) + while (j >= 0 && buf[j] == mem[cur_addr + j - mem_pos]) j--; if (j < 0) - return low + s; - else - s += bad_char_heuristic[iterator[s + size - 1]]; + return cur_addr; // We have a match. + cur_addr += bad_char_heuristic[mem[cur_addr + size - 1 - mem_pos]]; } return LLDB_INVALID_ADDRESS; diff --git a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py index 1c2c90d483ea3..c61ae15b9dda7 100644 --- a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py +++ b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py @@ -43,6 +43,9 @@ def _prepare_inferior(self): # inside the holes we've deliberately left empty. self.memory = self.frame().FindVariable("mem_with_holes").GetValueAsUnsigned() self.pagesize = self.frame().FindVariable("pagesize").GetValueAsUnsigned() + self.num_pages = ( + self.target().FindFirstGlobalVariable("num_pages").GetValueAsUnsigned() + ) positions = self.frame().FindVariable("positions") self.positions = [ positions.GetChildAtIndex(i).GetValueAsUnsigned() @@ -58,3 +61,13 @@ def test_memory_read(self): self.assertEqual(len(content), self.pagesize) self.assertEqual(content[0:7], b"needle\0") self.assertTrue(error.Fail()) + + def test_memory_find(self): + self._prepare_inferior() + + matches = [f"data found at location: {p:#x}" for p in self.positions] + self.expect( + f'memory find --count {len(self.positions)+1} --string "needle" ' + f"{self.memory:#x} {self.memory+self.pagesize*self.num_pages:#x}", + substrs=matches + ["no more matches within the range"], + ) From 0b2550f8ab77a53f560f6a7a1b401c4803a36d48 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 09:33:35 +0000 Subject: [PATCH 056/425] [lldb][test] Disable TestMultipleDebuggers on Linux This used to timeout (https://github.com/llvm/llvm-project/issues/101162) now it's aborting (https://github.com/llvm/llvm-project/pull/105765#issuecomment-2327645665) --- lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py index bee6e66aa7219..1fd4806cd74f4 100644 --- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py +++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py @@ -13,6 +13,7 @@ class TestMultipleSimultaneousDebuggers(TestBase): NO_DEBUG_INFO_TESTCASE = True # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162. + @skipIfLinux @skipIfNoSBHeaders @skipIfWindows @skipIfHostIncompatibleWithTarget From 69657eb7f67600ec1c5c449d13eef3670dfb64da Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 4 Sep 2024 17:37:34 +0800 Subject: [PATCH 057/425] [llc] Provide `opt` like verifier options (#106665) - Support `verify-each` option. - Default behavior is verifying output only. 
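An illustrative invocation (the input file and pass pipeline are placeholders):
`llc --passes=two-address-instruction -verify-each -o - foo.mir` verifies the MIR
after every pass, while the same command without `-verify-each` only runs the
verifier on the final output.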
--- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 +++ llvm/test/CodeGen/AArch64/PHIElimination-crash.mir | 2 +- .../test/CodeGen/AArch64/PHIElimination-debugloc.mir | 2 +- llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir | 2 +- .../AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir | 2 +- .../AMDGPU/early-lis-two-address-partial-def.mir | 2 +- llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir | 2 +- llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir | 2 +- llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir | 2 +- llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir | 2 +- .../CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir | 4 ++-- llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir | 2 +- llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir | 2 +- llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir | 2 +- llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir | 2 +- llvm/test/CodeGen/PowerPC/livevars-crash1.mir | 2 +- llvm/test/CodeGen/PowerPC/livevars-crash2.mir | 2 +- llvm/test/CodeGen/X86/distancemap.mir | 2 +- llvm/test/CodeGen/X86/phielim-undef.mir | 2 +- llvm/test/CodeGen/X86/twoaddr-mul2.mir | 2 +- llvm/test/tools/llc/new-pm/pipeline.mir | 2 +- llvm/test/tools/llc/new-pm/verify.mir | 2 +- llvm/tools/llc/NewPMDriver.cpp | 12 ++++++++---- llvm/tools/llc/NewPMDriver.h | 4 +++- llvm/tools/llc/llc.cpp | 11 ++++++++++- 27 files changed, 48 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index eb15beb835b53..64d20ae5b20ef 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -548,6 +548,9 @@ Error CodeGenPassBuilder::buildPipeline( if (auto Err = derived().addMachinePasses(addPass)) return std::move(Err); + if (!Opt.DisableVerify) + addPass(MachineVerifierPass()); + if (PrintAsm) { derived().addAsmPrinter( addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir index 8f43686429268..4e09580aeb218 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-crash.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 -# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o /dev/null %s \ +# RUN: llc -mtriple=aarch64-linux-gnu -verify-each -o /dev/null %s \ # RUN: --passes='require,phi-node-elimination,two-address-instruction' \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir index 9b8283352161a..01c44e3f253bb 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination,twoaddressinstruction \ # RUN: -no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ # RUN: | FileCheck %s -# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s \ +# RUN: llc -mtriple=aarch64-linux-gnu -verify-each -o - %s \ # RUN: --passes='require,phi-node-elimination,two-address-instruction' \ # RUN: 
-no-phi-elim-live-out-early-exit=1 -phi-elim-split-all-critical-edges=1 \ # RUN: | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index c483d669a2758..c1ddc9c14d814 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction -verify-each %s -o - | FileCheck %s # REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir index e84c51b73ad1e..75148ecff5377 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -early-live-intervals -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='require,two-address-instruction' -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='require,two-address-instruction' -verify-each -o - %s | FileCheck %s --- name: dyn_extract_v7f64_v_v diff --git a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir index bf111d19f2147..186b171f4e805 100644 --- a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir +++ b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX90A %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes='require,two-address-instruction' -o - %s | FileCheck --check-prefix=GFX90A %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes='require,two-address-instruction' -verify-each -o - %s | FileCheck --check-prefix=GFX90A %s --- name: aligned_partial_vgpr_64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir index 0f20b8a2f1e29..1768e39d1a06c 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX10 %s # GFX10-LABEL: name: test_fmamk_reg_imm_f16 # GFX10: %2:vgpr_32 = IMPLICIT_DEF diff --git 
a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index 91ade8806e4d1..820b8579bd0a4 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s --- name: test_fmamk_reg_imm_f16 diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 8b009978055ac..829d01d8e1c36 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -o - %s | FileCheck %s +# RUN: llc -mtriple amdgcn --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck %s # CHECK-LABEL: phi-cf-test # CHECK: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir index dfeca8db0b464..0ee768466509a 100644 --- a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass liveintervals,phi-node-elimination -o - %s | FileCheck -check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --passes='require,phi-node-elimination' -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck -check-prefixes=GCN %s # This checks liveintervals pass verification and phi-node-elimination correctly preserves them. 
diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir index 4bb0046c0ee01..77032ffcf18a9 100644 --- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir +++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=livevars,phi-node-elimination,twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='require,phi-node-elimination,two-address-instruction' -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='require,phi-node-elimination,two-address-instruction' -verify-each -o - %s | FileCheck %s # This used to fail under ASAN enabled build because we didn't update LiveVariables in SIInstrInfo::convertToThreeAddress() # CHECK: _amdgpu_ps_main diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir index ab6d207cd9668..b0a75a526cf2b 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f64 # GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir index daac34bab0fd0..1c444eca7675c 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f32 # GCN: %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir index 7062878a84609..9b1deb9aa9f73 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_madmk_reg_imm_f32 # GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec diff 
--git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir index 5fbb149548909..98d2eca213aae 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 # GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir index c533a5a167ab2..d5753491dbad1 100644 --- a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir +++ b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir @@ -1,5 +1,5 @@ # RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -march hexagon --passes='require,two-address-instruction' -o - %s | FileCheck %s +# RUN: llc -march hexagon --passes='require,two-address-instruction' -verify-each -o - %s | FileCheck %s ############################################################################### diff --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir index 2a669ed6b03a1..8bdf719f4bb5b 100644 --- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir +++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple powerpc64-unknown-linux-gnu -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple powerpc64-unknown-linux-gnu --passes='require,phi-node-elimination' -o - %s | FileCheck %s +# RUN: llc -mtriple powerpc64-unknown-linux-gnu --passes='require,phi-node-elimination' -verify-each -o - %s | FileCheck %s # This test case was originally known as # test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir index 68d2e5a627e9d..ec01a3c326cfa 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: --passes='require,phi-node-elimination' | \ +# RUN: --passes='require,phi-node-elimination' -verify-each | \ # RUN: FileCheck %s --- | diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir index e165c85d5b72a..deaae3936cefb 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir @@ -2,7 +2,7 @@ # RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ # RUN: FileCheck %s # RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: --passes='require,phi-node-elimination' | \ +# RUN: --passes='require,phi-node-elimination' -verify-each | \ # RUN: FileCheck %s --- | diff --git a/llvm/test/CodeGen/X86/distancemap.mir b/llvm/test/CodeGen/X86/distancemap.mir index 0a2f422302bd3..8b0c4bfe9d000 100644 --- 
a/llvm/test/CodeGen/X86/distancemap.mir +++ b/llvm/test/CodeGen/X86/distancemap.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs | FileCheck %s -# RUN: llc %s -o - -mtriple=x86_64-unknown-linux --passes=two-address-instruction | FileCheck %s +# RUN: llc %s -o - -mtriple=x86_64-unknown-linux --passes=two-address-instruction -verify-each | FileCheck %s # In TwoAddressInstructionPass, new instructions should be added to DistanceMap. # In this case, function convertInstTo3Addr is called on the first ADD diff --git a/llvm/test/CodeGen/X86/phielim-undef.mir b/llvm/test/CodeGen/X86/phielim-undef.mir index cebc725537d0e..1200449b49c78 100644 --- a/llvm/test/CodeGen/X86/phielim-undef.mir +++ b/llvm/test/CodeGen/X86/phielim-undef.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s -run-pass=livevars,phi-node-elimination,twoaddressinstruction | FileCheck %s -# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s --passes='require<live-vars>,phi-node-elimination,two-address-instruction' | FileCheck %s +# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -o - %s --passes='require<live-vars>,phi-node-elimination,two-address-instruction' -verify-each | FileCheck %s --- | @b114 = external global i16, align 1 diff --git a/llvm/test/CodeGen/X86/twoaddr-mul2.mir b/llvm/test/CodeGen/X86/twoaddr-mul2.mir index e21005fa92397..b24c893cc11fc 100644 --- a/llvm/test/CodeGen/X86/twoaddr-mul2.mir +++ b/llvm/test/CodeGen/X86/twoaddr-mul2.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-unknown -mcpu=haswell -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=x86_64-unknown -mcpu=haswell --passes=two-address-instruction -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=x86_64-unknown -mcpu=haswell --passes=two-address-instruction -verify-each %s -o - | FileCheck %s # Check that we don't have any uses of [[COPY]] after it is killed. 
--- diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir index ee058f763b779..761a3a424ee67 100644 --- a/llvm/test/tools/llc/new-pm/pipeline.mir +++ b/llvm/test/tools/llc/new-pm/pipeline.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes='require,print' -print-pipeline-passes < %s | FileCheck --check-prefix=ANALYSIS %s -# CHECK: function(machine-function(no-op-machine-function)),PrintMIRPreparePass,function(machine-function(print)) +# CHECK: function(machine-function(no-op-machine-function)),PrintMIRPreparePass,function(machine-function(verify,print)) # ANALYSIS: require,print diff --git a/llvm/test/tools/llc/new-pm/verify.mir b/llvm/test/tools/llc/new-pm/verify.mir index 0cc7fc837e5be..6aa4362811e04 100644 --- a/llvm/test/tools/llc/new-pm/verify.mir +++ b/llvm/test/tools/llc/new-pm/verify.mir @@ -1,4 +1,4 @@ -# RUN: not --crash llc -mtriple=x86_64-pc-linux-gnu -debug-pass-manager -passes='module(function(machine-function(trigger-verifier-error)))' -filetype=null %s 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=x86_64-pc-linux-gnu -debug-pass-manager -passes='module(function(machine-function(trigger-verifier-error)))' -verify-each -filetype=null %s 2>&1 | FileCheck %s # CHECK: Verifying machine function f # CHECK: Broken machine function found after pass "TriggerVerifierErrorPass" diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index c8088da49a278..2a49eaff1d7cb 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ b/llvm/tools/llc/NewPMDriver.cpp @@ -18,8 +18,10 @@ #include "llvm/CodeGen/CommandFlags.h" #include "llvm/CodeGen/MIRParser/MIRParser.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" @@ -29,7 +31,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/Passes/CodeGenPassBuilder.h" // TODO: Include pass headers properly. 
#include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/CommandLine.h" @@ -88,7 +89,7 @@ int llvm::compileModuleWithNewPM( StringRef Arg0, std::unique_ptr M, std::unique_ptr MIR, std::unique_ptr Target, std::unique_ptr Out, std::unique_ptr DwoOut, LLVMContext &Context, - const TargetLibraryInfoImpl &TLII, bool NoVerify, StringRef PassPipeline, + const TargetLibraryInfoImpl &TLII, VerifierKind VK, StringRef PassPipeline, CodeGenFileType FileType) { if (!PassPipeline.empty() && TargetPassConfig::hasLimitedCodeGenPipeline()) { @@ -104,14 +105,15 @@ int llvm::compileModuleWithNewPM( // Fetch options from TargetPassConfig CGPassBuilderOption Opt = getCGPassBuilderOption(); - Opt.DisableVerify = NoVerify; + Opt.DisableVerify = VK != VerifierKind::InputOutput; Opt.DebugPM = DebugPM; Opt.RegAlloc = RegAlloc; MachineModuleInfo MMI(&LLVMTM); PassInstrumentationCallbacks PIC; - StandardInstrumentations SI(Context, Opt.DebugPM, !NoVerify); + StandardInstrumentations SI(Context, Opt.DebugPM, + VK == VerifierKind::EachPass); registerCodeGenCallback(PIC, LLVMTM); MachineFunctionAnalysisManager MFAM; @@ -147,6 +149,8 @@ int llvm::compileModuleWithNewPM( ExitOnErr(PB.parsePassPipeline(MPM, PassPipeline)); MPM.addPass(PrintMIRPreparePass(*OS)); MachineFunctionPassManager MFPM; + if (VK == VerifierKind::InputOutput) + MFPM.addPass(MachineVerifierPass()); MFPM.addPass(PrintMIRPass(*OS)); FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); diff --git a/llvm/tools/llc/NewPMDriver.h b/llvm/tools/llc/NewPMDriver.h index b0beeaf596c8f..c8a60223cb296 100644 --- a/llvm/tools/llc/NewPMDriver.h +++ b/llvm/tools/llc/NewPMDriver.h @@ -32,6 +32,8 @@ class ToolOutputFile; class LLVMContext; class MIRParser; +enum class VerifierKind { None, InputOutput, EachPass }; + struct LLCDiagnosticHandler : public DiagnosticHandler { bool handleDiagnostics(const DiagnosticInfo &DI) override; }; @@ -42,7 +44,7 @@ int compileModuleWithNewPM(StringRef Arg0, std::unique_ptr M, std::unique_ptr Out, std::unique_ptr DwoOut, LLVMContext &Context, - const TargetLibraryInfoImpl &TLII, bool NoVerify, + const TargetLibraryInfoImpl &TLII, VerifierKind VK, StringRef PassPipeline, CodeGenFileType FileType); } // namespace llvm diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 80c84a977c26c..3f27a5fd1a0eb 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -130,6 +130,9 @@ static cl::opt SplitDwarfFile( static cl::opt NoVerify("disable-verify", cl::Hidden, cl::desc("Do not verify input module")); +static cl::opt VerifyEach("verify-each", + cl::desc("Verify after each transform")); + static cl::opt DisableSimplifyLibCalls("disable-simplify-libcalls", cl::desc("Disable simplify-libcalls")); @@ -647,10 +650,16 @@ static int compileModule(char **argv, LLVMContext &Context) { WithColor::warning(errs(), argv[0]) << ": warning: ignoring -mc-relax-all because filetype != obj"; + VerifierKind VK = VerifierKind::InputOutput; + if (NoVerify) + VK = VerifierKind::None; + else if (VerifyEach) + VK = VerifierKind::EachPass; + if (EnableNewPassManager || !PassPipeline.empty()) { return compileModuleWithNewPM(argv[0], std::move(M), std::move(MIR), std::move(Target), std::move(Out), - std::move(DwoOut), Context, TLII, NoVerify, + std::move(DwoOut), Context, TLII, VK, PassPipeline, codegen::getFileType()); } From deeafeab815ddfe7b507e9f79fe8f992265a9f3b Mon Sep 17 00:00:00 2001 From: Dmitry 
Vasilyev Date: Wed, 4 Sep 2024 13:38:55 +0400 Subject: [PATCH 058/425] [lldb][NFC] Move a few static helpers to the class Socket (#106640) Fixed a typo in Socket::SetOption(). --- lldb/include/lldb/Host/Socket.h | 15 +++++++++-- lldb/source/Host/common/Socket.cpp | 36 +++++++++++++++++++-------- lldb/source/Host/common/TCPSocket.cpp | 35 +++++--------------------- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h index 304a91bdf6741..764a048976eb4 100644 --- a/lldb/include/lldb/Host/Socket.h +++ b/lldb/include/lldb/Host/Socket.h @@ -112,8 +112,17 @@ class Socket : public IOObject { static llvm::Expected> UdpConnect(llvm::StringRef host_and_port, bool child_processes_inherit); - int GetOption(int level, int option_name, int &option_value); - int SetOption(int level, int option_name, int option_value); + static int GetOption(NativeSocket sockfd, int level, int option_name, + int &option_value); + int GetOption(int level, int option_name, int &option_value) { + return GetOption(m_socket, level, option_name, option_value); + }; + + static int SetOption(NativeSocket sockfd, int level, int option_name, + int option_value); + int SetOption(int level, int option_name, int option_value) { + return SetOption(m_socket, level, option_name, option_value); + }; NativeSocket GetNativeSocket() const { return m_socket; } SocketProtocol GetSocketProtocol() const { return m_protocol; } @@ -138,6 +147,8 @@ class Socket : public IOObject { virtual size_t Send(const void *buf, const size_t num_bytes); + static int CloseSocket(NativeSocket sockfd); + static Status GetLastError(); static void SetLastError(Status &error); static NativeSocket CreateSocket(const int domain, const int type, const int protocol, diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp index 1a506aa95b246..1a63571b94c6f 100644 --- a/lldb/source/Host/common/Socket.cpp +++ b/lldb/source/Host/common/Socket.cpp @@ -386,11 +386,7 @@ Status Socket::Close() { LLDB_LOGF(log, "%p Socket::Close (fd = %" PRIu64 ")", static_cast<void *>(this), static_cast<uint64_t>(m_socket)); -#if defined(_WIN32) - bool success = closesocket(m_socket) == 0; -#else - bool success = ::close(m_socket) == 0; -#endif + bool success = CloseSocket(m_socket) == 0; // A reference to a FD was passed in, set it to an invalid value m_socket = kInvalidSocketValue; if (!success) { @@ -400,18 +396,20 @@ Status Socket::Close() { return error; } -int Socket::GetOption(int level, int option_name, int &option_value) { +int Socket::GetOption(NativeSocket sockfd, int level, int option_name, + int &option_value) { get_socket_option_arg_type option_value_p = reinterpret_cast<get_socket_option_arg_type>(&option_value); socklen_t option_value_size = sizeof(int); - return ::getsockopt(m_socket, level, option_name, option_value_p, + return ::getsockopt(sockfd, level, option_name, option_value_p, &option_value_size); } -int Socket::SetOption(int level, int option_name, int option_value) { +int Socket::SetOption(NativeSocket sockfd, int level, int option_name, + int option_value) { set_socket_option_arg_type option_value_p = - reinterpret_cast<get_socket_option_arg_type>(&option_value); + reinterpret_cast<set_socket_option_arg_type>(&option_value); - return ::setsockopt(m_socket, level, option_name, option_value_p, + return ::setsockopt(sockfd, level, option_name, option_value_p, sizeof(option_value)); } @@ -427,6 +425,24 @@ void Socket::SetLastError(Status &error) { #endif } +Status Socket::GetLastError() { + std::error_code EC; +#ifdef _WIN32 + EC = 
llvm::mapWindowsError(WSAGetLastError()); +#else + EC = std::error_code(errno, std::generic_category()); +#endif + return EC; +} + +int Socket::CloseSocket(NativeSocket sockfd) { +#ifdef _WIN32 + return ::closesocket(sockfd); +#else + return ::close(sockfd); +#endif +} + NativeSocket Socket::CreateSocket(const int domain, const int type, const int protocol, bool child_processes_inherit, Status &error) { diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp index fc005814308d9..4f1518ef697ff 100644 --- a/lldb/source/Host/common/TCPSocket.cpp +++ b/lldb/source/Host/common/TCPSocket.cpp @@ -33,28 +33,9 @@ #include #endif -#ifdef _WIN32 -#define CLOSE_SOCKET closesocket -typedef const char *set_socket_option_arg_type; -#else -#include -#define CLOSE_SOCKET ::close -typedef const void *set_socket_option_arg_type; -#endif - using namespace lldb; using namespace lldb_private; -static Status GetLastSocketError() { - std::error_code EC; -#ifdef _WIN32 - EC = llvm::mapWindowsError(WSAGetLastError()); -#else - EC = std::error_code(errno, std::generic_category()); -#endif - return EC; -} - static const int kType = SOCK_STREAM; TCPSocket::TCPSocket(bool should_close, bool child_processes_inherit) @@ -208,12 +189,8 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { continue; // enable local address reuse - int option_value = 1; - set_socket_option_arg_type option_value_p = - reinterpret_cast(&option_value); - if (::setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, option_value_p, - sizeof(option_value)) == -1) { - CLOSE_SOCKET(fd); + if (SetOption(fd, SOL_SOCKET, SO_REUSEADDR, 1) == -1) { + CloseSocket(fd); continue; } @@ -229,8 +206,8 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { err = ::listen(fd, backlog); if (err == -1) { - error = GetLastSocketError(); - CLOSE_SOCKET(fd); + error = GetLastError(); + CloseSocket(fd); continue; } @@ -251,7 +228,7 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { void TCPSocket::CloseListenSockets() { for (auto socket : m_listen_sockets) - CLOSE_SOCKET(socket.first); + CloseSocket(socket.first); m_listen_sockets.clear(); } @@ -280,7 +257,7 @@ llvm::Expected> TCPSocket::Accept( const lldb_private::SocketAddress &AddrIn = m_listen_sockets[fd]; if (!AddrIn.IsAnyAddr() && AcceptAddr != AddrIn) { - CLOSE_SOCKET(sock); + CloseSocket(sock); LLDB_LOG(log, "rejecting incoming connection from {0} (expecting {1})", AcceptAddr.GetIPAddress(), AddrIn.GetIPAddress()); return; From 59093cae8681fe5d3d951905887b67a99acf76e6 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 10:39:43 +0100 Subject: [PATCH 059/425] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions (#97058) This patch adds intrinsics for LUTI2 and LUTI4 instructions, which use SVE registers, as specified in the https://github.com/ARM-software/acle/pull/324 --- clang/include/clang/Basic/arm_sve.td | 20 +- .../aarch64-sve2-intrinsics/acle_sve2_luti.c | 337 ++++++++++++++++++ .../acle_sve2_imm_lane.cpp | 32 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 ++ llvm/lib/Target/AArch64/SVEInstrFormats.td | 36 +- .../CodeGen/AArch64/sve2-intrinsics-luti.ll | 107 ++++++ 6 files changed, 550 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 078373823a3b6..81527d8b98760 100644 --- a/clang/include/clang/Basic/arm_sve.td 
+++ b/clang/include/clang/Basic/arm_sve.td @@ -1939,6 +1939,24 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } +//////////////////////////////////////////////////////////////////////////////// +// SVE2 - Lookup table +let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { + def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + + def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + + def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + +let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { + def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional @@ -2400,4 +2418,4 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c new file mode 100644 index 0000000000000..60c4828c407e8 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c @@ -0,0 +1,337 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature 
+sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -Wall -o /dev/null %s +#include + +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// SME-CHECK-LABEL: @test_svluti2_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// CHECK-NEXT: ret 
[[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti2_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.luti4.lane.nxv16i8( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( [[TABLE:%.*]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti4_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( [[TABLE:%.*]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t 
test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_s16_x211svint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_u16_x212svuint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], 
i32 5) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_f16_x213svfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z25test_svluti4_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( [[TMP0]], [[TMP1]], [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1); +} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp index bca063385420a..e405077b3de93 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp @@ -78,6 +78,14 @@ void test_range_0_7() SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), 
svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); } void test_range_0_3() @@ -146,6 +154,26 @@ void test_range_0_3() SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1); } void test_range_0_1() @@ -180,4 +208,8 @@ void test_range_0_1() SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 3735bf5222fce..6c50b18ee583f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1288,6 +1288,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SVE2_LUTI_Inrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + class SVE2_1VectorArg_Long_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, @@ -2682,6 +2689,19 @@ def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic; def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic; +// +// SVE2 - Lookup Table +// + +def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + // // SVE2 - Optional bit permutation // diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a1b960fa8a58a..d6d503171a41e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10412,6 +10412,15 @@ multiclass sve2_luti2_vector_index { let Inst{23-22} = idx{2-1}; let Inst{12} = idx{0}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; } // FP8 Look up table read with 4-bit indices @@ -10424,14 +10433,39 @@ multiclass sve2_luti4_vector_index { bits<2> idx; let Inst{23-22} = idx; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; } // FP8 Look up table read with 4-bit indices (two contiguous registers) multiclass sve2_luti4_vector_vg2_index { - def _H : sve2_lut_vector_index { + def NAME : sve2_lut_vector_index { bits<2> idx; let Inst{23-22} = idx; } + + def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8i16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8f16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8bf16 (!cast(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, + nxv8bf16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll new file mode 100644 index 0000000000000..5cea7536e1f3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+sve2,+lut,+bf16 | FileCheck %s + +define @test_luti2_lane_i8( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv16i8( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_i16( %table, %indices){ +; CHECK-LABEL: 
test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8i16( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_f16( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8f16( %table, %indices, i32 0) + ret %res +} + +define @test_luti2_lane_bf16( %table, %indices){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti2.lane.nxv8bf16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i8( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv16i8( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8i16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_f16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8f16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_bf16( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.nxv8bf16( %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_i16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8i16( %table, %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_f16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8f16( %table, %table, %indices, i32 0) + ret %res +} + +define @test_luti4_lane_bf16_x2( %table, %indices){ +; CHECK-LABEL: test_luti4_lane_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16( %table, %table, %indices, i32 0) + ret %res +} From 3e948eb3e88d89107406ca0812934bea42101e3a Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 10:39:59 +0100 Subject: [PATCH 060/425] [AArch64][NEON] Add intrinsics for LUTI (#96883) This patch adds intrinsics for NEON LUTI2 and LUTI4 instructions as specified in the [ACLE proposal](https://github.com/ARM-software/acle/pull/324) --- clang/include/clang/Basic/arm_neon.td | 19 + clang/lib/CodeGen/CGBuiltin.cpp | 89 +++ clang/test/CodeGen/aarch64-neon-luti.c | 506 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 35 ++ .../lib/Target/AArch64/AArch64InstrFormats.td | 14 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 40 ++ llvm/test/CodeGen/AArch64/neon-luti.ll | 253 +++++++++ 7 
files changed, 949 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-neon-luti.c create mode 100644 llvm/test/CodeGen/AArch64/neon-luti.ll diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 3098fa67e6a51..536c0652280b9 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2096,3 +2096,22 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } + +// Lookup table read with 2-bit/4-bit indices +let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { + def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc">; + def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc">; + def VLUTI2_H : SInst<"vluti2_lane", "Q.(; + def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(; + def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPc">; + def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPc">; + def VLUTI4_H_X2 : SInst<"vluti4_lane_x2", ".2(; + def VLUTI4_H_X2_Q : SInst<"vluti4_laneq_x2", ".2(; + + let ArchGuard = "defined(__aarch64__)", TargetGuard= "lut,bf16" in { + def VLUTI2_BF : SInst<"vluti2_lane", "Q.(; + def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(; + def VLUTI4_BF_X2 : SInst<"vluti4_lane_x2", ".2(; + def VLUTI4_BF_X2_Q : SInst<"vluti4_laneq_x2", ".2(; + } +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 012b43b8770be..e826c1c6fbbd2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13481,6 +13481,95 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_suqadd; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + + case NEON::BI__builtin_neon_vluti2_laneq_bf16: + case NEON::BI__builtin_neon_vluti2_laneq_f16: + case NEON::BI__builtin_neon_vluti2_laneq_p16: + case NEON::BI__builtin_neon_vluti2_laneq_p8: + case NEON::BI__builtin_neon_vluti2_laneq_s16: + case NEON::BI__builtin_neon_vluti2_laneq_s8: + case NEON::BI__builtin_neon_vluti2_laneq_u16: + case NEON::BI__builtin_neon_vluti2_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti2_laneq; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); + } + case NEON::BI__builtin_neon_vluti2q_laneq_bf16: + case NEON::BI__builtin_neon_vluti2q_laneq_f16: + case NEON::BI__builtin_neon_vluti2q_laneq_p16: + case NEON::BI__builtin_neon_vluti2q_laneq_p8: + case NEON::BI__builtin_neon_vluti2q_laneq_s16: + case NEON::BI__builtin_neon_vluti2q_laneq_s8: + case NEON::BI__builtin_neon_vluti2q_laneq_u16: + case NEON::BI__builtin_neon_vluti2q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti2_laneq; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ true)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); + } + case NEON::BI__builtin_neon_vluti2_lane_bf16: + case NEON::BI__builtin_neon_vluti2_lane_f16: + case NEON::BI__builtin_neon_vluti2_lane_p16: + case NEON::BI__builtin_neon_vluti2_lane_p8: + case NEON::BI__builtin_neon_vluti2_lane_s16: + case NEON::BI__builtin_neon_vluti2_lane_s8: + case NEON::BI__builtin_neon_vluti2_lane_u16: + case NEON::BI__builtin_neon_vluti2_lane_u8: { + Int = 
Intrinsic::aarch64_neon_vluti2_lane; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); + } + case NEON::BI__builtin_neon_vluti2q_lane_bf16: + case NEON::BI__builtin_neon_vluti2q_lane_f16: + case NEON::BI__builtin_neon_vluti2q_lane_p16: + case NEON::BI__builtin_neon_vluti2q_lane_p8: + case NEON::BI__builtin_neon_vluti2q_lane_s16: + case NEON::BI__builtin_neon_vluti2q_lane_s8: + case NEON::BI__builtin_neon_vluti2q_lane_u16: + case NEON::BI__builtin_neon_vluti2q_lane_u8: { + Int = Intrinsic::aarch64_neon_vluti2_lane; + llvm::Type *Tys[2]; + Tys[0] = Ty; + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ true)); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); + } + case NEON::BI__builtin_neon_vluti4q_lane_p8: + case NEON::BI__builtin_neon_vluti4q_lane_s8: + case NEON::BI__builtin_neon_vluti4q_lane_u8: { + Int = Intrinsic::aarch64_neon_vluti4q_lane; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_p8: + case NEON::BI__builtin_neon_vluti4q_laneq_s8: + case NEON::BI__builtin_neon_vluti4q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq"); + } + case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_f16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_p16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_s16_x2: + case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: { + Int = Intrinsic::aarch64_neon_vluti4q_lane_x2; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2"); + } } } diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c new file mode 100644 index 0000000000000..72cb6bcdb40f0 --- /dev/null +++ b/clang/test/CodeGen/aarch64-neon-luti.c @@ -0,0 +1,506 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +#include +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2_lane_u8(uint8x8_t vn, uint8x8_t vm) { + return vluti2_lane_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +uint8x16_t test_vluti2_laneq_u8(uint8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2q_lane_u8(uint8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_u8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +uint8x16_t test_vluti2q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_u8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2_lane_s8(int8x8_t vn, uint8x8_t vm) { + return vluti2_lane_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +int8x16_t test_vluti2_laneq_s8(int8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2q_lane_s8(int8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_s8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +int8x16_t test_vluti2q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_s8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> 
@llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2_lane_p8(poly8x8_t vn, uint8x8_t vm) { + return vluti2_lane_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +poly8x16_t test_vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_p8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +poly8x16_t test_vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_p8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +uint16x8_t test_vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm) { + return vluti2_lane_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +uint16x8_t test_vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +uint16x8_t test_vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_u16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> 
[[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +uint16x8_t test_vluti2q_laneq_u16(uint16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_u16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +int16x8_t test_vluti2_lane_s16(int16x4_t vn, uint8x8_t vm) { + return vluti2_lane_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +int16x8_t test_vluti2_laneq_s16(int16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +int16x8_t test_vluti2q_lane_s16(int16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_s16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +int16x8_t test_vluti2q_laneq_s16(int16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_s16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_lane_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]] +// +float16x8_t test_vluti2_lane_f16(float16x4_t vn, uint8x8_t vm) { + return vluti2_lane_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_laneq_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4f16(<4 x half> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]] +// +float16x8_t test_vluti2_laneq_f16(float16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_lane_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> [[VN]], <8 x i8> 
[[VM]], i32 3) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]] +// +float16x8_t test_vluti2q_lane_f16(float16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_f16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_laneq_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]] +// +float16x8_t test_vluti2q_laneq_f16(float16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_f16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_lane_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]] +// +bfloat16x8_t test_vluti2_lane_bf16(bfloat16x4_t vn, uint8x8_t vm) { + return vluti2_lane_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_laneq_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]] +// +bfloat16x8_t test_vluti2_laneq_bf16(bfloat16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_lane_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<8 x bfloat> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]] +// +bfloat16x8_t test_vluti2q_lane_bf16(bfloat16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_bf16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_laneq_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]] +// +bfloat16x8_t test_vluti2q_laneq_bf16(bfloat16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_bf16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +poly16x8_t test_vluti2_lane_p16(poly16x4_t vn, uint8x8_t vm) { + return vluti2_lane_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call 
<8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +poly16x8_t test_vluti2_laneq_p16(poly16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]] +// +poly16x8_t test_vluti2q_lane_p16(poly16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_p16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]] +// +poly16x8_t test_vluti2q_laneq_p16(poly16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_p16(vn, vm, 7); +} + +// + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +uint8x16_t test_vluti4q_lane_u8(uint8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +uint8x16_t test_vluti4q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +int8x16_t test_vluti4q_lane_s8(int8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_s8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +int8x16_t test_vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_s8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> 
@llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +poly8x16_t test_vluti4q_lane_p8(poly8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_p8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +poly8x16_t test_vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_p8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_u16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +uint16x8_t test_vluti4q_lane_u16_x2(uint16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_u16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_u16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +uint16x8_t test_vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_u16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_s16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +int16x8_t test_vluti4q_lane_s16_x2(int16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_s16_x2(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_s16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: 
[[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +int16x8_t test_vluti4q_laneq_s16_x2(int16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_s16_x2(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_lane_f16_x2( +// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANE_X24]] +// +float16x8_t test_vluti4q_lane_f16_x2(float16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_f16_x2(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_laneq_f16_x2( +// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANEQ_X24]] +// +float16x8_t test_vluti4q_laneq_f16_x2(float16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_f16_x2(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_lane_bf16_x2( +// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 2) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANE_X24]] +// +bfloat16x8_t test_vluti4q_lane_bf16_x2(bfloat16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_bf16_x2(vn, vm, 2); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_laneq_bf16_x2( +// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 2) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANEQ_X24]] +// +bfloat16x8_t 
test_vluti4q_laneq_bf16_x2(bfloat16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_bf16_x2(vn, vm, 2); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_p16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]] +// +poly16x8_t test_vluti4q_lane_p16_x2(poly16x8x2_t vn, uint8x8_t vm) { + return vluti4q_lane_p16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_p16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +poly16x8_t test_vluti4q_laneq_p16_x2(poly16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_p16_x2(vn, vm, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6c50b18ee583f..6727ee69d7b3e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -565,6 +565,41 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic; } +let TargetPrefix = "aarch64" in { +def int_aarch64_neon_vluti2_lane : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_v8i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti2_laneq : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_v16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti4q_lane: DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v8i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti4q_laneq: DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + + +def int_aarch64_neon_vluti4q_lane_x2: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_v8i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; + +def int_aarch64_neon_vluti4q_laneq_x2: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; +} + let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
class AdvSIMD_2Vector2Index_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index ab8251dc83014..16002011aedfb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -8239,11 +8239,11 @@ multiclass SIMDTableLookupTied { // AdvSIMD LUT //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupIndexed opc, RegisterOperand vectype, +class BaseSIMDTableLookupIndexed opc, RegisterOperand listtype, Operand idx_type, string asm, string kind> - : I<(outs vectype:$Rd), - (ins listtype:$Rn, vectype:$Rm, idx_type:$idx), + : I<(outs V128:$Rd), + (ins listtype:$Rn, V128:$Rm, idx_type:$idx), asm, "\t$Rd" # kind # ", $Rn, $Rm$idx", "", []>, Sched<[]> { bits<5> Rd; @@ -8263,22 +8263,22 @@ class BaseSIMDTableLookupIndexed opc, RegisterOperand vectype, } multiclass BaseSIMDTableLookupIndexed2 { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, V128, VecListOne16b, VectorIndexS, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, VecListOne16b, VectorIndexS32b_timm, asm, ".16b"> { bits<2> idx; let Inst{14-13} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, V128, VecListOne8h, VectorIndexH, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, VecListOne8h, VectorIndexH32b_timm, asm, ".8h" > { bits<3> idx; let Inst{14-12} = idx; } } multiclass BaseSIMDTableLookupIndexed4 { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, V128, VecListOne16b, VectorIndexD, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, VecListOne16b, VectorIndexD32b_timm, asm, ".16b"> { bit idx; let Inst{14} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, V128, VecListTwo8h, VectorIndexS, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, VecListTwo8h, VectorIndexS32b_timm, asm, ".8h" > { bits<2> idx; let Inst{14-13} = idx; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c659697c3a1be..ccef85bfaa8af 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6616,6 +6616,46 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + multiclass Luti2_patterns{ + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT64:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), + VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT128:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + } + + defm : Luti2_patterns; + defm : Luti2_patterns; + defm : Luti2_patterns; + defm : Luti2_patterns; + + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn, + v16i8:$Rm, i32:$idx)), + (LUT4_B 
VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_lane v16i8:$Rn, + v8i8:$Rm, i32:$idx)), + (LUT4_B VecListOne16b:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexD32b_timm:$idx)>; + + foreach VT = [v8i16, v8f16, v8bf16] in { + def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1, + VT:$Rn2, v16i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT (int_aarch64_neon_vluti4q_lane_x2 VT:$Rn1, + VT:$Rn2, v8i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + } } //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-luti.ll b/llvm/test/CodeGen/AArch64/neon-luti.ll new file mode 100644 index 0000000000000..5436662753762 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-luti.ll @@ -0,0 +1,253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon,+lut,+bf16 | FileCheck %s + +define <16 x i8> @test_luti2_lane_i8(<8 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2_laneq_i8(<8 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti2_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2_laneq_i16(<4 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> %vn, <16 x i8> 
%vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_laneq_i16(<8 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> %vn, <16 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x half> @test_luti2_lane_f16(<4 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2_laneq_f16(<4 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4i16(<4 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_lane_f16(<8 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_laneq_f16(<8 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti2_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2_laneq_bf16(<4 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<4 x 
bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_laneq_bf16(<8 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <16 x i8> @test_luti4q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti4q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti4q_lane_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm, i32 1) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti4q_laneq_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm, i32 1) + ret <8 x i16> %res +} + +define <8 x half> @test_luti4q_lane_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <8 x i8> %vm, i32 1) + ret <8 x half> %res +} + + +define <8 x half> @test_luti4q_laneq_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <16 x i8> %vm, i32 1) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti4q_laneq_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def 
$q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <16 x i8> %vm, i32 1) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti4q_lane_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <8 x i8> %vm){ +; CHECK-LABEL: test_luti4q_lane_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <8 x i8> %vm, i32 1) + ret <8 x bfloat> %res +} From 5a658ee933065d0e4ef1a65d9a6ddfba2874ee98 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 09:42:18 +0000 Subject: [PATCH 061/425] [lldb][test] Skip some lldb-server tests on Windows These are known to return errors occasionaly on our Windows on Arm bot. --- lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py | 1 + lldb/test/API/tools/lldb-server/TestLldbGdbServer.py | 1 + lldb/test/API/tools/lldb-server/TestNonStop.py | 1 + 3 files changed, 3 insertions(+) diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py index ad84a40932c65..a2ac1fdb6270f 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteLaunch.py @@ -58,6 +58,7 @@ def test_launch_via_vRun(self): self.assertEqual(context["O_content"], b"arg1\r\narg2\r\narg3\r\n") @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes returns '$E1f'. def test_launch_via_vRun_no_args(self): self.build() server = self.connect_to_debug_monitor() diff --git a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py index 93485cd32f519..592037db502aa 100644 --- a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py +++ b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py @@ -468,6 +468,7 @@ def test_Hg_fails_on_zero_pid(self): self.Hg_fails_on_pid(0) @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes returns '$E37'. def test_Hg_fails_on_minus_one_pid(self): self.build() self.set_inferior_startup_launch() diff --git a/lldb/test/API/tools/lldb-server/TestNonStop.py b/lldb/test/API/tools/lldb-server/TestNonStop.py index 62bda48ee049b..841de50818797 100644 --- a/lldb/test/API/tools/lldb-server/TestNonStop.py +++ b/lldb/test/API/tools/lldb-server/TestNonStop.py @@ -276,6 +276,7 @@ def test_multiple_vCont(self): self.expect_gdbremote_sequence() @add_test_categories(["llgs"]) + @skipIfWindows # Sometimes results in '$E37' instead of expected '$OK' def test_vCont_then_stop(self): self.build() self.set_inferior_startup_launch() From d25e24a0eb909b7604572d28d15cbe648ecccd90 Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Wed, 4 Sep 2024 11:49:16 +0200 Subject: [PATCH 062/425] =?UTF-8?q?[clang-tidy]=20Suggest=20using=20reinte?= =?UTF-8?q?rpret=5Fcast=20in=20bugprone-casting-thro=E2=80=A6=20(#106784)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ugh-void reinterpret_cast is the equivalent construct, and more clearly expresses intent. 
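As an illustration only (this example is not part of the diff; the variable and function names are made up for exposition), a minimal sketch of the pattern the check flags and the rewrite it now suggests:

    double d = 0.0;
    int *before() {
      // Flagged by bugprone-casting-through-void; with this change the diagnostic reads
      // "do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead".
      return static_cast<int *>(static_cast<void *>(&d));
    }
    int *after() {
      // The single equivalent cast the check suggests. Dereferencing the result still breaks
      // strict aliasing, so cppcoreguidelines-pro-type-reinterpret-cast is expected to warn here.
      return reinterpret_cast<int *>(&d);
    }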
Co-authored-by: Carlos Gálvez --- .../bugprone/CastingThroughVoidCheck.cpp | 4 +- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++ .../checks/bugprone/casting-through-void.rst | 21 +++++++++- .../bugprone/casting-through-void.cpp | 40 +++++++++---------- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp index 9e714b4be4dfe..f0a9ace229740 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp @@ -38,7 +38,9 @@ void CastingThroughVoidCheck::check(const MatchFinder::MatchResult &Result) { const auto ST = *Result.Nodes.getNodeAs<QualType>("source_type"); const auto VT = *Result.Nodes.getNodeAs<QualType>("void_type"); const auto *CE = Result.Nodes.getNodeAs<ExplicitCastExpr>("cast"); - diag(CE->getExprLoc(), "do not cast %0 to %1 through %2") << ST << TT << VT; + diag(CE->getExprLoc(), + "do not cast %0 to %1 through %2; use reinterpret_cast instead") + << ST << TT << VT; } } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index b001a6ad44669..6999c1ef2ea4b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -104,6 +104,10 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`bugprone-casting-through-void + <clang-tidy/checks/bugprone/casting-through-void>` check to suggest replacing + the offending code with ``reinterpret_cast``, to more clearly express intent. + - Improved :doc:`modernize-use-std-format <clang-tidy/checks/modernize/use-std-format>` check to support replacing member function calls too. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst index a9ab478b9a82e..d9f94b6a3f20b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst @@ -3,7 +3,9 @@ bugprone-casting-through-void ============================= -Detects unsafe or redundant two-step casting operations involving ``void*``. +Detects unsafe or redundant two-step casting operations involving ``void*``, +which is equivalent to ``reinterpret_cast`` as per the +`C++ Standard `_. Two-step type conversions via ``void*`` are discouraged for several reasons. @@ -16,7 +18,17 @@ Two-step type conversions via ``void*`` are discouraged for several reasons. In summary, avoiding two-step type conversions through ``void*`` ensures clearer code, maintains essential compiler warnings, and prevents ambiguity and potential runtime -errors, particularly in complex inheritance scenarios. +errors, particularly in complex inheritance scenarios. If such a cast is wanted, +it shall be done via ``reinterpret_cast``, to express the intent more clearly. + +Note: it is expected that, after applying the suggested fix and using +``reinterpret_cast``, the check :doc:`cppcoreguidelines-pro-type-reinterpret-cast +<../cppcoreguidelines/pro-type-reinterpret-cast>` will emit a warning. +This is intentional: ``reinterpret_cast`` is a dangerous operation that can +easily break the strict aliasing rules when dereferencing the casted pointer, +invoking Undefined Behavior. The warning is there to prompt users to carefully +analyze whether the usage of ``reinterpret_cast`` is safe, in which case the +warning may be suppressed.
Examples: @@ -29,3 +41,8 @@ Examples: reinterpret_cast(reinterpret_cast(ptr)); // WRONG (IntegerPointer)(void *)ptr; // WRONG IntegerPointer(static_cast(ptr)); // WRONG + + reinterpret_cast(ptr); // OK, clearly expresses intent. + // NOTE: dereferencing this pointer violates + // the strict aliasing rules, invoking + // Undefined Behavior. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp index a784e49885873..68172212904f8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp @@ -10,42 +10,42 @@ const double cd = 100; void normal_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'V' (aka 'void *') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'V' (aka 'void *'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'int *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'int *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); } void const_pointer_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'V' (aka 'void *') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *const' through 'V' (aka 'void *'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'int *' to 'int *const' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'int *' to 'int *const' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); } void const_test() { static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const V' (aka 'void *const') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'double *' to 'const int *' through 'const V' (aka 'void *const'); use reinterpret_cast 
instead [bugprone-casting-through-void] static_cast(static_cast(&i)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'int *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'int *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&i)); static_cast(static_cast(&cd)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&cd)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const CV' (aka 'const void *const') [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const double *' to 'const int *' through 'const CV' (aka 'const void *const'); use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&ci)); - // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const int *' to 'const int *' through 'const void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not cast 'const int *' to 'const int *' through 'const void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(static_cast(&ci)); } @@ -53,11 +53,11 @@ void const_test() { void reinterpret_cast_test() { static_cast(reinterpret_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] reinterpret_cast(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] reinterpret_cast(reinterpret_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast(reinterpret_cast(&i)); reinterpret_cast(reinterpret_cast(&i)); @@ -66,11 +66,11 @@ void reinterpret_cast_test() { void c_style_cast_test() { static_cast((void *)&d); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] (int *)(void *)&d; - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast((void *)&d); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: 
:[[@LINE-1]]:22: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] static_cast((void *)&i); } @@ -82,12 +82,12 @@ using I = int *; void cxx_functional_cast() { A(static_cast(&d)); I(static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not cast 'double *' to 'I' (aka 'int *') through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not cast 'double *' to 'I' (aka 'int *') through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] } void bit_cast() { __builtin_bit_cast(int *, static_cast(&d)); - // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void] + // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: do not cast 'double *' to 'int *' through 'void *'; use reinterpret_cast instead [bugprone-casting-through-void] } namespace PR87069 { From 519b36925cf2e1a59f76bd509471d2e1830169f0 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 4 Sep 2024 11:49:50 +0200 Subject: [PATCH 063/425] [LLD][COFF][NFC] Store impSym as DefinedImportData in ImportFile. (#107162) --- lld/COFF/InputFiles.cpp | 3 +-- lld/COFF/InputFiles.h | 2 +- lld/COFF/SymbolTable.cpp | 4 ++-- lld/COFF/SymbolTable.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index e1ea4ebeabc9b..50bc62312a6f8 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1059,8 +1059,7 @@ void ImportFile::parse() { // address pointed by the __imp_ symbol. (This allows you to call // DLL functions just like regular non-DLL functions.) if (hdr->getType() == llvm::COFF::IMPORT_CODE) - thunkSym = ctx.symtab.addImportThunk( - name, cast_or_null<DefinedImportData>(impSym), hdr->Machine); + thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); } BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index a332ac87b265e..8b3303a8d87f4 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -346,7 +346,7 @@ class ImportFile : public InputFile { static bool classof(const InputFile *f) { return f->kind() == ImportKind; } MachineTypes getMachineType() const override; - Symbol *impSym = nullptr; + DefinedImportData *impSym = nullptr; Symbol *thunkSym = nullptr; std::string dllName; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index a5f155bc05bc9..bb7583bb9a7df 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -771,12 +771,12 @@ Symbol *SymbolTable::addCommon(InputFile *f, StringRef n, uint64_t size, return s; } -Symbol *SymbolTable::addImportData(StringRef n, ImportFile *f) { +DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f) { auto [s, wasInserted] = insert(n, nullptr); s->isUsedInRegularObj = true; if (wasInserted || isa<Undefined>(s) || s->isLazy()) { replaceSymbol<DefinedImportData>(s, n, f); - return s; + return cast<DefinedImportData>(s); } reportDuplicate(s, f); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index b5f95d2ad7f11..51c6c79ec1446 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -102,7 +102,7 @@ class SymbolTable { Symbol *addCommon(InputFile *f, StringRef n, uint64_t size, const llvm::object::coff_symbol_generic *s = nullptr, CommonChunk *c = nullptr); - Symbol *addImportData(StringRef n, ImportFile *f); + DefinedImportData *addImportData(StringRef n, ImportFile *f); Symbol *addImportThunk(StringRef name, DefinedImportData
*s, uint16_t machine); void addLibcall(StringRef name); From 126d6f27102fca0d69dc50cf29a37442d18304cf Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 4 Sep 2024 11:03:22 +0100 Subject: [PATCH 064/425] [AMDGPU] Improve codegen for GFX10+ DPP reductions and scans (#107108) Use poison for an unused input to the permlanex16 intrinsic, to improve register allocation and avoid an unnecessary v_mov instruction. --- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 18 +- .../global-atomic-fadd.f32-no-rtn.ll | 9 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 27 +- .../atomic_optimizations_global_pointer.ll | 368 ++++---- .../atomic_optimizations_local_pointer.ll | 873 ++++++++---------- .../atomic_optimizations_pixelshader.ll | 24 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 5 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 22 +- .../AMDGPU/global_atomics_scan_fadd.ll | 250 ++--- .../AMDGPU/global_atomics_scan_fmax.ll | 156 ++-- .../AMDGPU/global_atomics_scan_fmin.ll | 156 ++-- .../AMDGPU/global_atomics_scan_fsub.ll | 250 ++--- 12 files changed, 877 insertions(+), 1281 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 95afc3fcc8d7d..f408a013d7a37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -421,9 +421,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce within each pair of rows (i.e. 32 lanes). assert(ST->hasPermLaneX16()); - Value *Permlanex16Call = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_permlanex16, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + Value *Permlanex16Call = + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, + {PoisonValue::get(AtomicTy), V, B.getInt32(0), + B.getInt32(0), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call); if (ST->isWave32()) { return V; @@ -432,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, if (ST->hasPermLane64()) { // Reduce across the upper and lower 32 lanes. Value *Permlane64Call = - B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V); return buildNonAtomicBinOp(B, Op, V, Permlane64Call); } @@ -481,9 +482,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). assert(ST->hasPermLaneX16()); - Value *PermX = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_permlanex16, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + Value *PermX = + B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, + {PoisonValue::get(AtomicTy), V, B.getInt32(-1), + B.getInt32(-1), B.getFalse(), B.getFalse()}); Value *UpdateDPPCall = B.CreateCall( UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), @@ -493,7 +495,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. 
Value *const Lane31 = B.CreateIntrinsic( - V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); + AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); Value *UpdateDPPCall = B.CreateCall( UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index fcd8c6fb0fe7c..9c634aba348d2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -285,6 +285,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 @@ -312,12 +313,12 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY11]], 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY11]], implicit $exec + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY12]], implicit $exec ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index a8f9ed2e6fba9..fdce9d9258c88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -269,22 +269,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 
280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[COPY11]], 0, implicit $exec ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] - ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] - ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec - ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY14]], implicit $exec + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] + ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY14]], implicit $exec + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY15]], implicit $exec ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} @@ -298,7 +299,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.6(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1 + ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1 ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def 
$exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} @@ -309,10 +310,10 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY16]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY17]], [[V_CMP_EQ_U32_e64_]], implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.6 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ed036a83b6143..9397aee37524f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1257,8 +1257,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -1320,8 +1319,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1379,15 +1377,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1453,25 +1450,23 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1518,15 +1513,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; 
GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1596,22 +1590,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -3150,23 +3143,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -3265,23 +3256,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 @@ -3348,7 +3337,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -3363,27 +3351,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -3465,13 +3451,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3482,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3563,13 +3547,12 @@ define amdgpu_kernel void @add_i64_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3580,29 +3563,27 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3690,7 +3671,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -3703,23 +3684,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 @@ -5024,8 +5004,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -5087,8 +5066,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5146,15 +5124,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5220,25 +5197,23 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5285,15 +5260,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5363,22 +5337,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -6959,23 +6932,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -7074,23 +7045,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: 
v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 @@ -7157,7 +7126,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -7172,27 +7140,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -7274,13 +7240,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7291,31 +7256,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: 
v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -7372,13 +7336,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7389,29 +7352,27 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; 
GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7499,7 +7460,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -7512,23 +7473,22 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index c7296185422ce..6d0e0cc7869b3 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -880,8 +880,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -936,8 +935,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; 
GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -986,21 +984,20 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -1053,15 +1050,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1394,8 +1390,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1428,8 +1423,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 @@ -1460,16 +1454,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1502,16 +1494,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -2623,23 +2613,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -2732,20 +2720,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 
v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 @@ -2807,7 +2793,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -2822,27 +2807,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, 
v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -2919,13 +2902,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -2936,28 +2918,28 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; 
GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 @@ -3434,10 +3416,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v3, v1 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -3500,10 +3480,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3560,11 +3538,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc @@ -3630,12 +3605,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 ; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo @@ -4528,8 +4500,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -4584,8 +4555,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -4634,21 +4604,20 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: 
v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -4701,15 +4670,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5042,8 +5010,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5076,8 +5043,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 @@ -5108,16 +5074,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -5150,16 +5114,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -6297,23 +6259,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -6406,20 +6366,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 @@ -6481,7 +6439,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -6496,27 +6453,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -6593,13 +6548,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6610,28 +6564,28 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 @@ -7093,8 +7047,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -7149,8 +7102,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -7199,21 +7151,20 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -7266,20 +7217,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; 
implicit-def: $vgpr0 @@ -7811,10 +7760,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -7894,10 +7841,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -7965,11 +7910,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -8061,16 +8003,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -8535,8 +8475,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -8591,8 +8530,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -8641,21 +8579,20 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -8708,15 +8645,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -9252,10 +9188,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -9335,10 +9269,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -9406,11 +9338,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -9502,16 +9431,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -9976,8 +9903,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -10032,8 +9958,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -10082,21 +10007,20 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -10149,15 +10073,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -10693,10 +10616,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -10776,10 +10697,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 @@ -10847,11 +10766,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -10943,16 +10859,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -11421,8 +11335,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -11477,8 +11390,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -11527,21 +11439,20 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -11594,20 +11505,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -12521,24 +12430,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 
v7, s4 @@ -12639,22 +12546,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -12738,35 +12643,32 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -12871,25 +12773,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -13361,8 +13261,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -13417,8 +13316,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -13467,21 +13365,20 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -13534,20 +13431,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -14457,24 +14352,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 @@ -14574,22 +14467,20 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14672,35 +14563,32 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -14805,25 +14693,23 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -15290,8 +15176,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -15346,8 +15231,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -15396,21 +15280,20 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -15463,15 +15346,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -16375,24 +16257,22 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 @@ -16491,22 +16371,20 @@ 
define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -16588,35 +16466,32 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: 
v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -16719,25 +16594,23 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 @@ -17209,8 +17082,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 @@ -17265,8 +17137,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 @@ -17315,21 +17186,20 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -17382,20 +17252,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -18295,24 +18163,22 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 
v7, s4 @@ -18411,22 +18277,20 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 @@ -18508,35 +18372,32 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -18639,25 +18500,23 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index f67fcd6e0caf5..22eb8d05b5ff2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -396,8 +396,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s12 @@ -460,8 +459,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -518,21 +516,19 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1164-NEXT: v_mov_b32_e32 v2, s12 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 @@ -594,20 +590,18 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index ad3f920eadc91..6bd0b11acc3ea 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -186,8 +186,9 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec ; GFX11_GFX12-NEXT: 
[[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec ; GFX11_GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 3951e02d46a8f..6766c0c1fdaeb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -229,7 +229,9 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec @@ -241,8 +243,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]] ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} @@ -250,8 +252,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; 
GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY7]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3.Flow: @@ -264,13 +266,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4 (%ir-block.32): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]] - ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec + ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]] + ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 44cd2c6e3af67..584b280cefb8a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -917,8 +917,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -999,8 +998,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1070,16 +1068,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1139,15 +1135,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2155,8 +2150,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -2237,8 +2231,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2308,16 +2301,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2377,15 +2368,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -3453,8 +3443,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -3535,8 +3524,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3606,16 +3594,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3675,15 +3661,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4247,8 +4232,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -4329,8 +4313,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4400,16 +4383,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -4469,15 +4450,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5570,8 +5550,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -5652,8 +5631,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5723,16 +5701,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, 
v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -5805,16 +5781,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -7630,10 +7605,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -7760,10 +7733,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7881,11 +7852,8 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -8007,20 +7975,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -9212,10 +9178,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -9308,10 +9272,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 
row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9395,11 +9357,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -9489,20 +9448,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10671,10 +10628,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: 
v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -10767,10 +10722,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10854,11 +10807,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -10948,20 +10898,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: 
s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11612,10 +11560,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -11708,10 +11654,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11795,11 +11739,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -11889,20 +11830,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, 
s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -13750,10 +13689,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -13880,10 +13817,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14001,11 +13936,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -14127,20 +14059,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index f0196fadc4b3f..464ec088dc297 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; 
GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] @@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 @@ -6543,10 +6515,8 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] @@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] @@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 
v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index f672c9c6afa22..26a0e34d18bdb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 @@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instid1(SALU_CYCLE_1) @@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] @@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 @@ -6543,10 +6515,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] @@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 @@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, 
-1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 @@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] @@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2165a6ff65e3b..c158a8007bcc5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -995,8 +995,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, 
v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -1077,8 +1076,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1148,16 +1146,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1230,16 +1226,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2345,8 +2340,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, 
v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -2427,8 +2421,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2498,16 +2491,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2580,16 +2571,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -3695,8 +3685,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, 
v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -3777,8 +3766,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3848,16 +3836,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3930,16 +3916,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4541,8 +4526,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 
32 @@ -4623,8 +4607,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4694,16 +4677,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -4776,16 +4757,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -5890,8 +5870,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 @@ -5972,8 +5951,7 @@ define amdgpu_kernel 
void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6043,16 +6021,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -6125,16 +6101,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -7950,10 +7925,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: 
v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -8080,10 +8053,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8201,11 +8172,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -8327,20 +8295,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -9531,10 +9497,8 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -9627,10 +9591,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9714,11 +9676,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -9808,20 +9767,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10990,10 +10947,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -11086,10 +11041,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11173,11 +11126,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -11267,20 +11217,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11931,10 +11879,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 @@ -12027,10 +11973,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12114,11 +12058,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 @@ -12208,20 +12149,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -14068,10 +14007,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 @@ -14198,10 +14135,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14319,11 +14254,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 @@ -14445,20 +14377,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: From 2fef449f30e2f484897cb199e3338a1520803c7d Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 4 Sep 2024 11:07:11 +0100 Subject: [PATCH 065/425] [LLVM][AArch64] Enable verifyTargetSDNode for scalable vectors and fix the fallout. (#104820) Fix incorrect use of AArch64ISD::UZP1/UUNPK{HI,LO} in: AArch64TargetLowering::LowerDIV AArch64TargetLowering::LowerINSERT_SUBVECTOR The latter highlighted DAG combines that relied on broken behaviour, which this patch also fixes. --- .../Target/AArch64/AArch64ISelLowering.cpp | 87 ++++++++++++++----- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1735ff5cd6974..5e3f9364ac3e1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14908,10 +14908,11 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, // NOP cast operands to the largest legal vector of the same element count. if (VT.isFloatingPoint()) { Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); - Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); + Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG); } else { // Legal integer vectors are already their largest so Vec0 is fine as is. 
Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1); } // To replace the top/bottom half of vector V with vector SubV we widen the @@ -14920,11 +14921,13 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); + HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); } else { assert(Idx == InVT.getVectorMinNumElements() && "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); + LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } @@ -15024,7 +15027,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); - return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); + SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo); + SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi); + return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast); } bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles( @@ -22739,7 +22744,19 @@ static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, SDValue Rshrnb = DAG.getNode( AArch64ISD::RSHRNB_I, DL, ResVT, {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb); + return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb); +} + +static SDValue isNVCastToHalfWidthElements(SDValue V) { + if (V.getOpcode() != AArch64ISD::NVCAST) + return SDValue(); + + SDValue Op = V.getOperand(0); + if (V.getValueType().getVectorElementCount() != + Op.getValueType().getVectorElementCount() * 2) + return SDValue(); + + return Op; } static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, @@ -22802,25 +22819,37 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG)) return Urshr; - if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget)) - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1); + if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) { + if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) { + Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1); + } + } - if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget)) - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb); + if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) { + if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) { + Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb); + } + } - // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) - if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { - if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue X = Op0.getOperand(0).getOperand(0); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + // uzp1(nvcast(unpklo(uzp1(x, y))), z) => uzp1(x, z) + if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) { + if 
(PreCast.getOpcode() == AArch64ISD::UUNPKLO) { + if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue X = PreCast.getOperand(0).getOperand(0); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + } } } - // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) - if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { - if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue Z = Op1.getOperand(0).getOperand(1); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + // uzp1(x, nvcast(unpkhi(uzp1(y, z)))) => uzp1(x, z) + if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) { + if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) { + if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue Z = PreCast.getOperand(0).getOperand(1); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + } } } @@ -29415,9 +29444,6 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { VT.isInteger() && "Expected integer vectors!"); assert(OpVT.getSizeInBits() == VT.getSizeInBits() && "Expected vectors of equal size!"); - // TODO: Enable assert once bogus creations have been fixed. - if (VT.isScalableVector()) - break; assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 && "Expected result vector with half the lanes of its input!"); break; @@ -29435,12 +29461,27 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { EVT Op1VT = N->getOperand(1).getValueType(); assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() && "Expected vectors!"); - // TODO: Enable assert once bogus creations have been fixed. - if (VT.isScalableVector()) - break; assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!"); break; } + case AArch64ISD::RSHRNB_I: { + assert(N->getNumValues() == 1 && "Expected one result!"); + assert(N->getNumOperands() == 2 && "Expected two operands!"); + EVT VT = N->getValueType(0); + EVT Op0VT = N->getOperand(0).getValueType(); + EVT Op1VT = N->getOperand(1).getValueType(); + assert(VT.isVector() && VT.isInteger() && + "Expected integer vector result type!"); + assert(Op0VT.isVector() && Op0VT.isInteger() && + "Expected first operand to be an integer vector!"); + assert(VT.getSizeInBits() == Op0VT.getSizeInBits() && + "Expected vectors of equal size!"); + assert(VT.getVectorElementCount() == Op0VT.getVectorElementCount() * 2 && + "Expected input vector with half the lanes of its result!"); + assert(Op1VT == MVT::i32 && isa(N->getOperand(1)) && + "Expected second operand to be a constant i32!"); + break; + } } } #endif From 29c076b8598c9627cea493fdfc1a30c83385e820 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 12:11:08 +0200 Subject: [PATCH 066/425] [Lint] Fix crash with scalable alloca --- llvm/lib/Analysis/Lint.cpp | 4 ++-- llvm/test/Analysis/Lint/scalable.ll | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Analysis/Lint/scalable.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index a44d5a3bbe462..29b5d38fc93b7 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -436,8 +436,8 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, if (AllocaInst *AI = dyn_cast(Base)) { Type *ATy = AI->getAllocatedType(); - if (!AI->isArrayAllocation() && ATy->isSized()) - BaseSize = DL->getTypeAllocSize(ATy); + if (!AI->isArrayAllocation() && ATy->isSized() && !ATy->isScalableTy()) + BaseSize = DL->getTypeAllocSize(ATy).getFixedValue(); BaseAlign = AI->getAlign(); } else if 
(GlobalVariable *GV = dyn_cast(Base)) { // If the global may be defined differently in another compilation unit diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll new file mode 100644 index 0000000000000..73100eca8835c --- /dev/null +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=lint < %s | FileCheck %s + +; Make sure we don't crash. + +define @test() { +; CHECK-LABEL: define @test() { +; CHECK-NEXT: [[A:%.*]] = alloca , align 8 +; CHECK-NEXT: [[V:%.*]] = load , ptr [[A]], align 8 +; CHECK-NEXT: ret [[V]] +; + %a = alloca + %v = load , ptr %a + ret %v +} From 4f3f09e787bab3caccd9496d93e6453c71d7869f Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 4 Sep 2024 11:13:10 +0100 Subject: [PATCH 067/425] [flang][debug] Add stride information for assumed shape array. (#106703) Without this information, debugger could present wrong values for arrays in certain cases as shown in issue #105646. Fixes #105646. --- .../Optimizer/Transforms/DebugTypeGenerator.cpp | 16 ++++++++++++---- .../Integration/debug-assumed-shape-array.f90 | 4 ++-- .../Transforms/debug-assumed-shape-array.fir | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 029d3776bcc0b..7c4382079fd6d 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -116,8 +116,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( unsigned offset = dimsOffset; const unsigned indexSize = dimsSize / 3; for ([[maybe_unused]] auto _ : seqTy.getShape()) { - // For each dimension, find the offset of count and lower bound in the - // descriptor and generate the dwarf expression to extract it. + // For each dimension, find the offset of count, lower bound and stride in + // the descriptor and generate the dwarf expression to extract it. // FIXME: If `indexSize` happens to be bigger than address size on the // system then we may have to change 'DW_OP_deref' here. addOp(llvm::dwarf::DW_OP_push_object_address, {}); @@ -139,10 +139,18 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DIExpressionAttr::get(context, ops); ops.clear(); + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, + {offset + (indexSize * kDimStridePos)}); + addOp(llvm::dwarf::DW_OP_deref, {}); + // stride[i] = *(base_addr + offset + (indexSize * kDimStridePos)) + mlir::LLVM::DIExpressionAttr strideAttr = + mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + offset += dimsSize; mlir::LLVM::DISubrangeAttr subrangeTy = mlir::LLVM::DISubrangeAttr::get( - context, countAttr, lowerAttr, /*upperBound=*/nullptr, - /*stride=*/nullptr); + context, countAttr, lowerAttr, /*upperBound=*/nullptr, strideAttr); elements.push_back(subrangeTy); } return mlir::LLVM::DICompositeTypeAttr::get( diff --git a/flang/test/Integration/debug-assumed-shape-array.f90 b/flang/test/Integration/debug-assumed-shape-array.f90 index a23ffa13cf0ac..9a439e20d1981 100644 --- a/flang/test/Integration/debug-assumed-shape-array.f90 +++ b/flang/test/Integration/debug-assumed-shape-array.f90 @@ -8,6 +8,6 @@ end subroutine ff ! 
CHECK-DAG: !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) ! CHECK-DAG: ![[ELEMS]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} -! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref)) -! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref)) +! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 40, DW_OP_deref)) +! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 64, DW_OP_deref)) diff --git a/flang/test/Transforms/debug-assumed-shape-array.fir b/flang/test/Transforms/debug-assumed-shape-array.fir index 0a9b84ad253aa..d1e64297acea7 100644 --- a/flang/test/Transforms/debug-assumed-shape-array.fir +++ b/flang/test/Transforms/debug-assumed-shape-array.fir @@ -11,6 +11,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : #loc2 = loc("test1.f90":3:16) // CHECK: #llvm.di_composite_type, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>>, -// CHECK-SAME: #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>> +// CHECK-SAME: elements = #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(40), DW_OP_deref]>>, +// CHECK-SAME: #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(64), DW_OP_deref]>> // CHECK-SAME: dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>> From d77ccae4a629ba11b5c28f97222a8834c5e5c183 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 4 Sep 2024 10:22:58 +0000 Subject: [PATCH 068/425] [lldb] Fix 32 bit compile error https://lab.llvm.org/buildbot/#/builders/18/builds/3247/steps/4/logs/stdio In code added by https://github.com/llvm/llvm-project/issues/87471. --- lldb/source/Target/Process.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 6c5c5162e2468..97ce2c14458e9 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -3377,7 +3377,8 @@ lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high, mem.resize_for_overwrite(max_read_size); Status error; mem.resize(ReadMemory(cur_addr, mem.data(), - std::min(mem.size(), high - cur_addr), error)); + std::min(mem.size(), high - cur_addr), + error)); mem_pos = cur_addr; if (size > mem.size()) { // We didn't read enough data. Skip to the next memory region. 
From 4f130fa943af8bf47f4401deff0d825a91dc7584 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 4 Sep 2024 11:28:47 +0100 Subject: [PATCH 069/425] [flang][Driver] support -fno-openmp (#107087) Closes #83148 --- clang/include/clang/Driver/Options.td | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 4 +++- flang/test/Driver/fno-openmp.f90 | 12 ++++++++++++ flang/test/Driver/fopenmp.f90 | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 flang/test/Driver/fno-openmp.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 83cf753e82484..8fe9f4f28f8fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3545,6 +3545,7 @@ def fopenmp : Flag<["-"], "fopenmp">, Group, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Parse OpenMP pragmas and generate parallel code.">; def fno_openmp : Flag<["-"], "fno-openmp">, Group, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Flags<[NoArgumentUnused]>; class OpenMPVersionHelp { string str = !strconcat( diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 1d73397d33017..9e42fcc2e39d5 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -968,7 +968,9 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - if (!args.hasArg(clang::driver::options::OPT_fopenmp)) + llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, + clang::driver::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) return true; unsigned numErrorsBefore = diags.getNumErrors(); diff --git a/flang/test/Driver/fno-openmp.f90 b/flang/test/Driver/fno-openmp.f90 new file mode 100644 index 0000000000000..98c8793c8c9bc --- /dev/null +++ b/flang/test/Driver/fno-openmp.f90 @@ -0,0 +1,12 @@ +! RUN: %flang_fc1 -fopenmp -fno-openmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-NO-OMP %s +! RUN: %flang_fc1 -fno-openmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-NO-OMP %s +! RUN: %flang_fc1 -fno-openmp -fopenmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-OMP %s +! RUN: %flang_fc1 -fopenmp %s -emit-hlfir -o - | FileCheck --check-prefix=CHECK-OMP %s + +subroutine main + ! CHECK-NO-OMP-NOT: omp.parallel + ! CHECK-OMP: omp.parallel + !$omp parallel + print *,"test" + !$omp end parallel +end subroutine diff --git a/flang/test/Driver/fopenmp.f90 b/flang/test/Driver/fopenmp.f90 index d70fe100c3d21..9b4dc5ffb1f69 100644 --- a/flang/test/Driver/fopenmp.f90 +++ b/flang/test/Driver/fopenmp.f90 @@ -11,6 +11,9 @@ ! RUN: %flang -target x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-NO-OPENMP --check-prefix=CHECK-WARNING ! RUN: %flang -target x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-OPENMP +!RUN: %flang -fno-openmp -fopenmp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-OPENMP +!RUN: %flang -fopenmp -fno-openmp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FC1-NO-OPENMP + ! CHECK-FC1-OPENMP: "-fc1" ! CHECK-FC1-OPENMP: "-fopenmp" ! @@ -59,8 +62,14 @@ ! RUN: %flang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY ! 
RUN: %flang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD ! +! RUN: %flang -target x86_64-linux-gnu -fno-openmp -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY +! RUN: %flang -target x86_64-linux-gnu -fopenmp -fno-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-LD-ANY +! ! CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}" ! CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}" ! +! CHECK-NO-LD-ANY: "{{.*}}ld{{(.exe)?}}" +! CHECK-NO-LD-ANY-NOT: "-l{{(omp|gomp|iomp5)}}" +! ! CHECK-LD-ANYMD: "{{.*}}ld{{(.exe)?}}" ! CHECK-LD-ANYMD: "-l{{(omp|gomp|iomp5md)}}" From 5818337765e4c74918a700a14df5f64a658c47ee Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 11:47:37 +0100 Subject: [PATCH 070/425] LICM: hoist BO assoc when (C1 op LV) op C2 (#106999) Extend hoistBOAssociation to handle the "(C1 op LV) op C2" case, when op is commutative. --- llvm/lib/Transforms/Scalar/LICM.cpp | 17 ++++---- llvm/test/Transforms/LICM/hoist-binop.ll | 49 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 1aad182bb7433..5cf7c252bb5f3 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2803,15 +2803,15 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, /// Reassociate associative binary expressions of the form /// -/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" +/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" if op is an associative BinOp +/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is a commutative BinOp /// -/// where op is an associative binary op, LV is a loop variant, and C1 and C2 -/// are loop invariants that we want to hoist. +/// where LV is a loop variant, and C1 and C2 are loop invariants that we want +/// to hoist. /// /// TODO: This can be extended to more cases such as -/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" -/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative -/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative +/// 1. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" if op is an associative BinOp +/// 2. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is a commutative BinOp static bool hoistBOAssociation(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, @@ -2830,11 +2830,14 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, BO0->hasNUsesOrMore(3)) return false; - // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)" Value *LV = BO0->getOperand(0); Value *C1 = BO0->getOperand(1); Value *C2 = BO->getOperand(1); + if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) { + assert(BO0->isCommutative() && "Associativity implies commutativity"); + std::swap(LV, C1); + } if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2)) return false; diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll index 53c1df61931d7..b0ee45a5fb350 100644 --- a/llvm/test/Transforms/LICM/hoist-binop.ll +++ b/llvm/test/Transforms/LICM/hoist-binop.ll @@ -67,6 +67,31 @@ loop: br label %loop } + +; Hoist ADD and copy NUW if both ops have it. Commutative version.
+define void @add_nuw_comm(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = add nuw i64 %step.add, %c2 + br label %loop +} + ; Hoist MUL and drop NUW even if both ops have it. define void @mul_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw( @@ -91,6 +116,30 @@ loop: br label %loop } +; Hoist MUL and drop NUW even if both ops have it. Commutative version. +define void @mul_nuw_comm(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = mul nuw i64 %step.add, %c2 + br label %loop +} + ; Hoist ADD but don't copy NUW if only one op has it. 
define void @add_no_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_no_nuw( From 5a6926ce49b6df807bff6083325ca291b0e731e5 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 4 Sep 2024 11:48:02 +0100 Subject: [PATCH 071/425] [AMDGPU] Fix test update after #107108 --- .../CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9397aee37524f..cc7050d08541a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -5351,7 +5351,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 From 360f82f3703fa57de42a2f998b172551f294e11a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 12:47:35 +0200 Subject: [PATCH 072/425] [Lint] Fix crash for insert/extract on scalable vector Don't assume the vector is fixed size. For scalable vectors, do not report an error, as indices outside the minimum range may be valid. 
--- llvm/lib/Analysis/Lint.cpp | 17 +++++++++-------- llvm/test/Analysis/Lint/scalable.ll | 25 +++++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 29b5d38fc93b7..415b16d25efd2 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -590,19 +590,20 @@ void Lint::visitIndirectBrInst(IndirectBrInst &I) { void Lint::visitExtractElementInst(ExtractElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getIndexOperand(), - /*OffsetOk=*/false))) - Check( - CI->getValue().ult( - cast(I.getVectorOperandType())->getNumElements()), - "Undefined result: extractelement index out of range", &I); + /*OffsetOk=*/false))) { + ElementCount EC = I.getVectorOperandType()->getElementCount(); + Check(EC.isScalable() || CI->getValue().ult(EC.getFixedValue()), + "Undefined result: extractelement index out of range", &I); + } } void Lint::visitInsertElementInst(InsertElementInst &I) { if (ConstantInt *CI = dyn_cast(findValue(I.getOperand(2), - /*OffsetOk=*/false))) - Check(CI->getValue().ult( - cast(I.getType())->getNumElements()), + /*OffsetOk=*/false))) { + ElementCount EC = I.getType()->getElementCount(); + Check(EC.isScalable() || CI->getValue().ult(EC.getFixedValue()), "Undefined result: insertelement index out of range", &I); + } } void Lint::visitUnreachableInst(UnreachableInst &I) { diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll index 73100eca8835c..4bcc4dae8d837 100644 --- a/llvm/test/Analysis/Lint/scalable.ll +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -1,15 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=lint < %s | FileCheck %s +; RUN: opt -S -passes=lint -disable-output < %s 2>&1 | FileCheck %s --allow-empty -; Make sure we don't crash. 
- -define @test() { -; CHECK-LABEL: define @test() { -; CHECK-NEXT: [[A:%.*]] = alloca , align 8 -; CHECK-NEXT: [[V:%.*]] = load , ptr [[A]], align 8 -; CHECK-NEXT: ret [[V]] -; +; CHECK-NOT: Buffer overflow +define @alloca_access() { %a = alloca %v = load , ptr %a ret %v } + +; CHECK-NOT: insertelement index out of range +define @insertelement() { + %insert = insertelement poison, half 0xH0000, i64 100 + ret %insert +} + +; CHECK-NOT: extract index out of range +define half @extractelement( %v) { + %insert = extractelement %v, i64 100 + ret half %insert +} From 6d3563422ce6f431b837221932d32db4c9681fc5 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 4 Sep 2024 11:58:07 +0100 Subject: [PATCH 073/425] include REQUIRES guard to aarch64-neon-luti.c (#107217) --- clang/test/CodeGen/aarch64-neon-luti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c index 72cb6bcdb40f0..40daf742eb966 100644 --- a/clang/test/CodeGen/aarch64-neon-luti.c +++ b/clang/test/CodeGen/aarch64-neon-luti.c @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: aarch64-registered-target #include // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s From cd46829e547d2d0aa3cb0ef7c9de59c507eaaecc Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Wed, 4 Sep 2024 16:28:39 +0530 Subject: [PATCH 074/425] [LV] Fix emission of debug message in legality check (#101924) Successful vectorization message is emitted even after "Result" is false. "Result" = false indicates failure of one of the legality check and thus successful message should not be printed. --- .../Vectorize/LoopVectorizationLegality.cpp | 20 ++++++++---- .../LoopVectorize/check-no-vectorize.ll | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 66a779da8c25b..7042af6dd8eae 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1451,10 +1451,12 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { // Check whether the loop-related control flow in the loop nest is expected by // vectorizer. if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) { - if (DoExtraAnalysis) + if (DoExtraAnalysis) { + LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest"); Result = false; - else + } else { return false; + } } // We need to have a loop header. @@ -1519,17 +1521,21 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } - LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" - << (LAI->getRuntimePointerChecking()->Need - ? " (with a runtime bound check)" - : "") - << "!\n"); + if (Result) { + LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? 
" (with a runtime bound check)" + : "") + << "!\n"); + } unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; if (PSE.getPredicate().getComplexity() > SCEVThreshold) { + LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable " + "due to SCEVThreshold"); reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", "TooManySCEVRunTimeChecks", ORE, TheLoop); diff --git a/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll new file mode 100644 index 0000000000000..e45a204931319 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/check-no-vectorize.ll @@ -0,0 +1,32 @@ +; This test checks that we don't emit both +; successful and unsuccessful message about vectorization. + +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug -disable-output < %s 2>&1 | FileCheck %s +; CHECK-NOT: LV: We can vectorize this loop +; CHECK: LV: Not vectorizing: Cannot prove legality +; CHECK-NOT: LV: We can vectorize this loop + +@a = global [32000 x i32] zeroinitializer, align 4 + +define void @foo(i32 %val1, i32 %val2) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i32 [ %val1, %entry ], [ %add1, %for.body ] + %1 = phi i32 [ %val2, %entry ], [ %2, %for.body ] + %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x i32], ptr @a, i64 0, i64 %iv + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx2 = getelementptr inbounds [32000 x i32], ptr @a, i64 0, i64 %iv.next + %2 = load i32, ptr %arrayidx2, align 4 + %add0 = add nsw i32 %2, %1 + %add1 = add nsw i32 %add0, %0 + store i32 %add1, ptr %arrayidx, align 4 + %exitcond = icmp eq i64 %iv.next, 31999 + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %for.body + ret void +} From 7afdc6bd57d634354597df185fd7037bec9241ff Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 11:09:36 +0100 Subject: [PATCH 075/425] [DAG] Fix typo in i64/i128 abdu/abds tests I'd incorrectly swapped the operands in some of the "cmp" test patterns when I changed the condition code --- llvm/test/CodeGen/AArch64/abds-neg.ll | 21 +- llvm/test/CodeGen/AArch64/abds.ll | 21 +- llvm/test/CodeGen/AArch64/abdu-neg.ll | 22 +- llvm/test/CodeGen/AArch64/abdu.ll | 22 +- llvm/test/CodeGen/RISCV/abds-neg.ll | 233 ++++++++-------- llvm/test/CodeGen/RISCV/abds.ll | 273 ++++++++++--------- llvm/test/CodeGen/RISCV/abdu-neg.ll | 366 ++++++++++++-------------- llvm/test/CodeGen/RISCV/abdu.ll | 359 +++++++++++++------------ llvm/test/CodeGen/X86/abds-neg.ll | 37 +-- llvm/test/CodeGen/X86/abds.ll | 37 ++- llvm/test/CodeGen/X86/abdu-neg.ll | 22 +- llvm/test/CodeGen/X86/abdu.ll | 89 +++---- 12 files changed, 738 insertions(+), 764 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll index d4c6a09405e0c..ac7cb1f619557 100644 --- a/llvm/test/CodeGen/AArch64/abds-neg.ll +++ b/llvm/test/CodeGen/AArch64/abds-neg.ll @@ -377,30 +377,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x9, x8, gt +; CHECK-NEXT: csel x0, x9, x8, lt ; CHECK-NEXT: ret %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define 
i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbc x9, x1, x3 -; CHECK-NEXT: subs x10, x2, x0 -; CHECK-NEXT: sbc x11, x3, x1 -; CHECK-NEXT: sbcs xzr, x3, x1 -; CHECK-NEXT: csel x0, x8, x10, lt -; CHECK-NEXT: csel x1, x9, x11, lt +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbc x8, x1, x3 +; CHECK-NEXT: subs x9, x2, x0 +; CHECK-NEXT: sbc x10, x3, x1 +; CHECK-NEXT: subs x11, x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x0, x11, x9, lt +; CHECK-NEXT: csel x1, x8, x10, lt ; CHECK-NEXT: ret %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 45bb8749b25ed..0e35f8240848b 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -343,31 +343,30 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, ge +; CHECK-NEXT: csel x0, x9, x8, gt ; CHECK-NEXT: ret %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, ge -; CHECK-NEXT: csel x1, x10, x8, ge +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbc x11, x3, x1 +; CHECK-NEXT: sbcs xzr, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lt +; CHECK-NEXT: csel x1, x9, x11, lt ; CHECK-NEXT: ret %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index b148a29a72976..2118816ca7c58 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -379,31 +379,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x9, x8, hi +; CHECK-NEXT: csel x0, x9, x8, lo ; CHECK-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbc x8, x1, x3 +; CHECK-NEXT: subs x9, x2, x0 +; CHECK-NEXT: sbc x10, x3, x1 +; CHECK-NEXT: subs x11, x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x0, x11, x9, lo +; CHECK-NEXT: csel x1, x8, x10, lo ; CHECK-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 
%cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 22d41dfb85a62..eb866e6a78a9b 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -346,31 +346,31 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, hs +; CHECK-NEXT: csel x0, x9, x8, hi ; CHECK-NEXT: ret %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, hs -; CHECK-NEXT: csel x1, x10, x8, hs +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbcs x9, x1, x3 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NEXT: eor x8, x8, x10 +; CHECK-NEXT: eor x9, x9, x10 +; CHECK-NEXT: subs x0, x8, x10 +; CHECK-NEXT: sbc x1, x9, x10 ; CHECK-NEXT: ret %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index 058f105e8f735..168615983d970 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -1791,20 +1791,20 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a5, a3, a1 +; RV32I-NEXT: slt a5, a1, a3 ; RV32I-NEXT: .LBB21_2: ; RV32I-NEXT: bnez a5, .LBB21_4 ; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 @@ -1812,7 +1812,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a1, a0, .LBB21_2 +; RV64I-NEXT: blt a0, a1, .LBB21_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret @@ -1822,20 +1822,20 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: mv a5, a4 ; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a5, a3, a1 +; RV32ZBB-NEXT: slt a5, a1, a3 ; RV32ZBB-NEXT: .LBB21_2: ; RV32ZBB-NEXT: bnez a5, .LBB21_4 ; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a0, a2 @@ -1843,109 +1843,103 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: min a2, a0, a1 
-; RV64ZBB-NEXT: max a0, a0, a1 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: blt a0, a1, .LBB21_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB21_2: +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: ret %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB22_2 +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: mv t4, t1 +; RV32I-NEXT: beq t0, a2, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t0, a2 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a4, a3 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB22_4 +; RV32I-NEXT: beq a1, a5, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: sltu t3, a1, a5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: beqz t6, .LBB22_6 +; RV32I-NEXT: xor t5, t0, a2 +; RV32I-NEXT: xor t6, a7, a6 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: mv t6, t3 +; RV32I-NEXT: beqz t5, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t6, t4 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB22_8 +; RV32I-NEXT: sltu t4, a3, a4 +; RV32I-NEXT: mv t5, t4 +; RV32I-NEXT: beq a1, a5, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: sltu t5, a5, a1 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: sltu t5, a3, a5 -; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB22_10 +; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t1, a6, a7 +; RV32I-NEXT: sub a2, a2, t0 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: sub a1, a5, t4 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: bnez t3, .LBB22_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 ; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sub a6, a6, t3 ; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a4, a6, t4 -; RV32I-NEXT: j .LBB22_13 -; RV32I-NEXT: .LBB22_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; 
RV32I-NEXT: sub a1, a4, t5 -; RV32I-NEXT: sub a4, a6, t6 -; RV32I-NEXT: .LBB22_13: -; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: .LBB22_11: +; RV32I-NEXT: sw a6, 8(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: -; RV64I-NEXT: sltu a4, a2, a0 +; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: mv a5, a4 ; RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slt a5, a3, a1 +; RV64I-NEXT: slt a5, a1, a3 ; RV64I-NEXT: .LBB22_2: ; RV64I-NEXT: bnez a5, .LBB22_4 ; RV64I-NEXT: # %bb.3: +; RV64I-NEXT: sltu a4, a2, a0 ; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a0, a2 @@ -1953,95 +1947,86 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a2, 12(a2) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: mv t4, t1 +; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t0, a2 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a4, a3 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: sltu t3, a1, a5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: addi sp, sp, -16 -; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 -; RV32ZBB-NEXT: or t6, s0, t6 -; RV32ZBB-NEXT: beqz t6, .LBB22_6 +; RV32ZBB-NEXT: xor t5, t0, a2 +; RV32ZBB-NEXT: xor t6, a7, a6 +; RV32ZBB-NEXT: or t5, t6, t5 +; RV32ZBB-NEXT: mv t6, t3 +; RV32ZBB-NEXT: beqz t5, .LBB22_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t6, t4 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 +; RV32ZBB-NEXT: sltu t4, a3, a4 +; RV32ZBB-NEXT: mv t5, t4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: sltu t5, a5, a1 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 -; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 +; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t1, a6, a7 +; RV32ZBB-NEXT: sub a2, a2, t0 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a1 +; RV32ZBB-NEXT: sub a1, a5, t4 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: bnez t3, .LBB22_12 
-; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 ; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a6, a6, t3 ; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a4, a6, t4 -; RV32ZBB-NEXT: j .LBB22_13 -; RV32ZBB-NEXT: .LBB22_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 -; RV32ZBB-NEXT: sub a4, a6, t6 -; RV32ZBB-NEXT: .LBB22_13: -; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: .LBB22_11: +; RV32ZBB-NEXT: sw a6, 8(a0) ; RV32ZBB-NEXT: sw a1, 4(a0) ; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 12(a0) -; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: mv a5, a4 ; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: slt a5, a3, a1 +; RV64ZBB-NEXT: slt a5, a1, a3 ; RV64ZBB-NEXT: .LBB22_2: ; RV64ZBB-NEXT: bnez a5, .LBB22_4 ; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sltu a4, a2, a0 ; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a2, a0 ; RV64ZBB-NEXT: ret ; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a0, a2 @@ -2049,7 +2034,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index b867a55445c95..86b36d8f69e95 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -1448,250 +1448,265 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a5, a1, a3 +; RV32I-NEXT: slt a5, a3, a1 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: beqz a5, .LBB21_4 +; RV32I-NEXT: bnez a5, .LBB21_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bge a0, a1, .LBB21_2 +; RV64I-NEXT: blt a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: 
abd_cmp_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: mv a5, a4 ; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a5, a1, a3 +; RV32ZBB-NEXT: slt a5, a3, a1 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: beqz a5, .LBB21_4 +; RV32ZBB-NEXT: bnez a5, .LBB21_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: bge a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a7, 8(a2) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: sltu a2, a7, a6 +; RV32I-NEXT: mv t4, a2 +; RV32I-NEXT: beq t0, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t0, a2 +; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: xor t3, t0, a2 -; RV32I-NEXT: xor t5, a7, a6 -; RV32I-NEXT: sltu t2, a4, a3 -; RV32I-NEXT: or t5, t5, t3 +; RV32I-NEXT: sltu t2, a5, a3 +; RV32I-NEXT: sltu t5, a1, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: beq a4, a1, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: xor t6, t0, t1 +; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: beqz t6, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 +; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beq a1, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: beqz t6, .LBB22_10 +; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: mv t6, t5 +; RV32I-NEXT: beq a4, a1, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: .LBB22_10: +; RV32I-NEXT: bnez t3, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, 
a6, t3 -; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu a7, a6, t4 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 -; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: j .LBB22_13 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: sltu a2, a6, a7 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sltu a7, a6, t6 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: .LBB22_13: +; RV32I-NEXT: sw a4, 8(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: -; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sltu a4, a2, a0 ; RV64I-NEXT: mv a5, a4 ; RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slt a5, a1, a3 +; RV64I-NEXT: slt a5, a3, a1 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: beqz a5, .LBB22_4 +; RV64I-NEXT: bnez a5, .LBB22_4 ; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t0, a2 +; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: xor t3, t0, a2 -; RV32ZBB-NEXT: xor t5, a7, a6 -; RV32ZBB-NEXT: sltu t2, a4, a3 -; RV32ZBB-NEXT: or t5, t5, t3 +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB22_6 ; RV32ZBB-NEXT: # 
%bb.5: -; RV32ZBB-NEXT: mv t6, t4 +; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: beqz t6, .LBB22_10 +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB22_10: +; RV32ZBB-NEXT: bnez t3, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 -; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB22_13 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sltu a7, a6, t6 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB22_13: +; RV32ZBB-NEXT: sw a4, 8(a0) ; RV32ZBB-NEXT: sw a1, 4(a0) ; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sltu a4, a2, a0 ; RV64ZBB-NEXT: mv a5, a4 ; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: slt a5, a1, a3 +; RV64ZBB-NEXT: slt a5, a3, a1 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: beqz a5, .LBB22_4 +; RV64ZBB-NEXT: bnez a5, .LBB22_4 ; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: sub a0, a2, a0 ; RV64ZBB-NEXT: ret ; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index bcacdf44ab103..87a06fc4403eb 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -1597,17 +1597,6 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { -; CHECK-LABEL: abd_cmp_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: andi 
a2, a0, 255 -; CHECK-NEXT: andi a3, a1, 255 -; CHECK-NEXT: bgeu a3, a2, .LBB18_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: ret ; NOZBB-LABEL: abd_cmp_i8: ; NOZBB: # %bb.0: ; NOZBB-NEXT: andi a2, a0, 255 @@ -1740,28 +1729,27 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: sub a2, a0, a2 -; RV32I-NEXT: beq a3, a1, .LBB21_2 +; RV32I-NEXT: mv a5, a4 +; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a0, a1, a3 -; RV32I-NEXT: j .LBB21_3 +; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: sltu a0, a0, a2 -; RV32I-NEXT: .LBB21_3: -; RV32I-NEXT: neg a1, a0 -; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: bnez a5, .LBB21_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB21_4: +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a1, a0, .LBB21_2 +; RV64I-NEXT: bltu a0, a1, .LBB21_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret @@ -1772,234 +1760,218 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: sub a2, a0, a2 -; RV32ZBB-NEXT: beq a3, a1, .LBB21_2 +; RV32ZBB-NEXT: mv a5, a4 +; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a0, a1, a3 -; RV32ZBB-NEXT: j .LBB21_3 +; RV32ZBB-NEXT: sltu a5, a1, a3 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: sltu a0, a0, a2 -; RV32ZBB-NEXT: .LBB21_3: -; RV32ZBB-NEXT: neg a1, a0 -; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 +; RV32ZBB-NEXT: bnez a5, .LBB21_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB21_4: +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: minu a2, a0, a1 -; RV64ZBB-NEXT: maxu a0, a0, a1 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: bltu a0, a1, .LBB21_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB21_2: +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw t0, 4(a2) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw 
a6, 8(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw t0, 12(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu a2, a4, a7 -; RV32I-NEXT: sub t1, a6, t1 -; RV32I-NEXT: sltu t2, a3, a5 -; RV32I-NEXT: sub a2, t1, a2 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, t0, .LBB22_2 +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: mv t4, t1 +; RV32I-NEXT: beq t0, a2, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, t0 +; RV32I-NEXT: sltu t4, t0, a2 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t3, a7, t1 -; RV32I-NEXT: sub a2, a2, t3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: beq a2, a6, .LBB22_4 +; RV32I-NEXT: sltu t2, a4, a3 +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: beq a1, a5, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a6, a2 -; RV32I-NEXT: j .LBB22_5 +; RV32I-NEXT: sltu t3, a1, a5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: sltu t1, a4, a7 -; RV32I-NEXT: .LBB22_5: -; RV32I-NEXT: sub t0, a1, t0 -; RV32I-NEXT: sub t0, t0, t2 -; RV32I-NEXT: sub a5, a3, a5 -; RV32I-NEXT: beq t0, a1, .LBB22_7 -; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, t0 -; RV32I-NEXT: j .LBB22_8 -; RV32I-NEXT: .LBB22_7: -; RV32I-NEXT: sltu a1, a3, a5 +; RV32I-NEXT: xor t5, t0, a2 +; RV32I-NEXT: xor t6, a7, a6 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: mv t6, t3 +; RV32I-NEXT: beqz t5, .LBB22_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: mv t6, t4 +; RV32I-NEXT: .LBB22_6: +; RV32I-NEXT: sltu t4, a3, a4 +; RV32I-NEXT: mv t5, t4 +; RV32I-NEXT: beq a1, a5, .LBB22_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: sltu t5, a5, a1 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: xor a3, a2, a6 -; RV32I-NEXT: xor a4, a7, a4 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: beqz a3, .LBB22_10 +; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: sltu t1, a6, a7 +; RV32I-NEXT: sub a2, a2, t0 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: sub a1, a5, t4 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: neg a6, a1 -; RV32I-NEXT: xor a3, a7, a6 -; RV32I-NEXT: sltu a4, a3, a6 -; RV32I-NEXT: xor a2, a2, a6 -; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sub a4, a2, a4 -; RV32I-NEXT: xor a2, a5, a6 -; RV32I-NEXT: sltu a5, a2, a6 -; RV32I-NEXT: xor a7, t0, a6 -; RV32I-NEXT: mv t1, a5 -; RV32I-NEXT: beqz t0, .LBB22_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: .LBB22_12: -; RV32I-NEXT: add a3, a3, a1 -; RV32I-NEXT: sltu a6, a3, t1 -; RV32I-NEXT: sub a4, a4, a6 -; RV32I-NEXT: sub a3, a3, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: sw a1, 0(a0) -; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a3, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a6, a6, t3 +; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: .LBB22_11: +; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 -; RV64I-NEXT: sub a3, a1, a3 -; RV64I-NEXT: sub a3, a3, a4 -; RV64I-NEXT: sub a2, a0, a2 -; RV64I-NEXT: beq a3, a1, .LBB22_2 +; RV64I-NEXT: mv a5, a4 +; 
RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sltu a0, a1, a3 -; RV64I-NEXT: j .LBB22_3 +; RV64I-NEXT: sltu a5, a1, a3 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: sltu a0, a0, a2 -; RV64I-NEXT: .LBB22_3: -; RV64I-NEXT: neg a1, a0 -; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: bnez a5, .LBB22_4 +; RV64I-NEXT: # %bb.3: +; RV64I-NEXT: sltu a4, a2, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB22_4: +; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) -; RV32ZBB-NEXT: lw t0, 4(a2) +; RV32ZBB-NEXT: lw a3, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw t0, 12(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu a2, a4, a7 -; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a3, a5 -; RV32ZBB-NEXT: sub a2, t1, a2 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, t0, .LBB22_2 +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: mv t4, t1 +; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, t0 +; RV32ZBB-NEXT: sltu t4, t0, a2 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t3, a7, t1 -; RV32ZBB-NEXT: sub a2, a2, t3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: beq a2, a6, .LBB22_4 +; RV32ZBB-NEXT: sltu t2, a4, a3 +; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a6, a2 -; RV32ZBB-NEXT: j .LBB22_5 +; RV32ZBB-NEXT: sltu t3, a1, a5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: sltu t1, a4, a7 -; RV32ZBB-NEXT: .LBB22_5: -; RV32ZBB-NEXT: sub t0, a1, t0 -; RV32ZBB-NEXT: sub t0, t0, t2 -; RV32ZBB-NEXT: sub a5, a3, a5 -; RV32ZBB-NEXT: beq t0, a1, .LBB22_7 -; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, t0 -; RV32ZBB-NEXT: j .LBB22_8 -; RV32ZBB-NEXT: .LBB22_7: -; RV32ZBB-NEXT: sltu a1, a3, a5 +; RV32ZBB-NEXT: xor t5, t0, a2 +; RV32ZBB-NEXT: xor t6, a7, a6 +; RV32ZBB-NEXT: or t5, t6, t5 +; RV32ZBB-NEXT: mv t6, t3 +; RV32ZBB-NEXT: beqz t5, .LBB22_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: mv t6, t4 +; RV32ZBB-NEXT: .LBB22_6: +; RV32ZBB-NEXT: sltu t4, a3, a4 +; RV32ZBB-NEXT: mv t5, t4 +; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 +; RV32ZBB-NEXT: # %bb.7: +; RV32ZBB-NEXT: sltu t5, a5, a1 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: xor a3, a2, a6 -; RV32ZBB-NEXT: xor a4, a7, a4 -; RV32ZBB-NEXT: or a3, a4, a3 -; RV32ZBB-NEXT: beqz a3, .LBB22_10 +; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: sltu t1, a6, a7 +; RV32ZBB-NEXT: sub a2, a2, t0 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a1 +; RV32ZBB-NEXT: sub a1, a5, t4 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: neg a6, a1 -; RV32ZBB-NEXT: xor a3, a7, a6 -; RV32ZBB-NEXT: sltu a4, a3, a6 -; RV32ZBB-NEXT: xor a2, a2, a6 -; 
RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sub a4, a2, a4 -; RV32ZBB-NEXT: xor a2, a5, a6 -; RV32ZBB-NEXT: sltu a5, a2, a6 -; RV32ZBB-NEXT: xor a7, t0, a6 -; RV32ZBB-NEXT: mv t1, a5 -; RV32ZBB-NEXT: beqz t0, .LBB22_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: .LBB22_12: -; RV32ZBB-NEXT: add a3, a3, a1 -; RV32ZBB-NEXT: sltu a6, a3, t1 -; RV32ZBB-NEXT: sub a4, a4, a6 -; RV32ZBB-NEXT: sub a3, a3, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: sw a1, 0(a0) -; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a3, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a6, a6, t3 +; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: .LBB22_11: +; RV32ZBB-NEXT: sw a6, 8(a0) +; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 -; RV64ZBB-NEXT: sub a3, a1, a3 -; RV64ZBB-NEXT: sub a3, a3, a4 -; RV64ZBB-NEXT: sub a2, a0, a2 -; RV64ZBB-NEXT: beq a3, a1, .LBB22_2 +; RV64ZBB-NEXT: mv a5, a4 +; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sltu a0, a1, a3 -; RV64ZBB-NEXT: j .LBB22_3 +; RV64ZBB-NEXT: sltu a5, a1, a3 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: sltu a0, a0, a2 -; RV64ZBB-NEXT: .LBB22_3: -; RV64ZBB-NEXT: neg a1, a0 -; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 +; RV64ZBB-NEXT: bnez a5, .LBB22_4 +; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: add a0, a2, a0 +; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB22_4: +; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 39aef369a2967..14f45895754df 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1457,249 +1457,266 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: mv a5, a4 -; RV32I-NEXT: beq a1, a3, .LBB21_2 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sub a2, a0, a2 +; RV32I-NEXT: beq a3, a1, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a5, a1, a3 +; RV32I-NEXT: sltu a0, a1, a3 +; RV32I-NEXT: j .LBB21_3 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: beqz a5, .LBB21_4 -; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a0, a0, a2 +; RV32I-NEXT: .LBB21_3: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: xor a2, a2, a1 +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: xor a1, a3, a1 +; RV32I-NEXT: add a1, a1, a0 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret 
; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bgeu a0, a1, .LBB21_2 +; RV64I-NEXT: bltu a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 -; RV32ZBB-NEXT: mv a5, a4 -; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a5, a1, a3 +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB21_3 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: beqz a5, .LBB21_4 -; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 -; RV32ZBB-NEXT: ret -; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB21_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: bgeu a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw t0, 4(a2) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: sltu a2, a4, a7 +; RV32I-NEXT: sub t1, a6, t1 +; RV32I-NEXT: sltu t2, a3, a5 +; RV32I-NEXT: sub a2, t1, a2 +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: beq a1, t0, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t4, t0, a2 +; RV32I-NEXT: sltu t1, a1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: xor t3, t0, a2 -; RV32I-NEXT: xor t5, a7, a6 -; RV32I-NEXT: sltu t2, a4, a3 -; RV32I-NEXT: or t5, t5, t3 -; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: sub a7, a4, a7 +; RV32I-NEXT: sltu t3, a7, t1 +; RV32I-NEXT: sub a2, a2, t3 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: beq a2, a6, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: sltu t1, a6, a2 +; RV32I-NEXT: j .LBB22_5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 -; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 -; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 -; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: sltu t1, 
a4, a7 +; RV32I-NEXT: .LBB22_5: +; RV32I-NEXT: sub t0, a1, t0 +; RV32I-NEXT: sub t0, t0, t2 +; RV32I-NEXT: sub a5, a3, a5 +; RV32I-NEXT: beq t0, a1, .LBB22_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: sltu a1, a1, t0 +; RV32I-NEXT: j .LBB22_8 +; RV32I-NEXT: .LBB22_7: +; RV32I-NEXT: sltu a1, a3, a5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: beqz t6, .LBB22_10 +; RV32I-NEXT: xor a3, a2, a6 +; RV32I-NEXT: xor a4, a7, a4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: beqz a3, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 -; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 +; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: neg a6, a1 +; RV32I-NEXT: xor a3, a7, a6 +; RV32I-NEXT: sltu a4, a3, a6 +; RV32I-NEXT: xor a2, a2, a6 +; RV32I-NEXT: add a2, a2, a1 +; RV32I-NEXT: sub a4, a2, a4 +; RV32I-NEXT: xor a2, a5, a6 +; RV32I-NEXT: sltu a5, a2, a6 +; RV32I-NEXT: xor a7, t0, a6 +; RV32I-NEXT: mv t1, a5 +; RV32I-NEXT: beqz t0, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: add a3, a3, a1 +; RV32I-NEXT: sltu a6, a3, t1 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: add a7, a7, a1 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 -; RV64I-NEXT: mv a5, a4 -; RV64I-NEXT: beq a1, a3, .LBB22_2 +; RV64I-NEXT: sub a3, a1, a3 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: sub a2, a0, a2 +; RV64I-NEXT: beq a3, a1, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sltu a5, a1, a3 +; RV64I-NEXT: sltu a0, a1, a3 +; RV64I-NEXT: j .LBB22_3 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: beqz a5, .LBB22_4 -; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a0, a0, a2 +; RV64I-NEXT: .LBB22_3: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: sltu a4, a2, a1 +; RV64I-NEXT: xor a1, a3, a1 +; RV64I-NEXT: add a1, a1, a0 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; 
RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t4, t0, a2 +; RV32ZBB-NEXT: sltu t1, a1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: xor t3, t0, a2 -; RV32ZBB-NEXT: xor t5, a7, a6 -; RV32ZBB-NEXT: sltu t2, a4, a3 -; RV32ZBB-NEXT: or t5, t5, t3 -; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB22_5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 -; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t6, t4 -; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 -; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB22_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB22_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, a1, t0 +; RV32ZBB-NEXT: j .LBB22_8 +; RV32ZBB-NEXT: .LBB22_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: beqz t6, .LBB22_10 +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 +; RV32ZBB-NEXT: mv a1, t1 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 -; RV64ZBB-NEXT: mv a5, a4 -; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 +; RV64ZBB-NEXT: sub 
a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, a2 +; RV64ZBB-NEXT: beq a3, a1, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sltu a5, a1, a3 +; RV64ZBB-NEXT: sltu a0, a1, a3 +; RV64ZBB-NEXT: j .LBB22_3 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: beqz a5, .LBB22_4 -; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 -; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a0, a0, a2 +; RV64ZBB-NEXT: .LBB22_3: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: xor a2, a2, a1 +; RV64ZBB-NEXT: sltu a4, a2, a1 +; RV64ZBB-NEXT: xor a1, a3, a1 +; RV64ZBB-NEXT: add a1, a1, a0 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 246cd8e0e852d..833273dc98243 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -842,8 +842,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovll %edi, %eax -; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: cmovgel %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -853,13 +853,14 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax +; X64-NEXT: negq %rax +; X64-NEXT: subq %rsi, %rdi +; X64-NEXT: cmovlq %rdi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -888,10 +889,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovgel %ebx, %esi +; X86-NEXT: cmovgel %ebp, %ecx +; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -906,20 +907,20 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: subq %rdx, %rax -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: sbbq %rcx, %r8 -; X64-NEXT: subq %rdi, %rdx -; X64-NEXT: sbbq %rsi, %rcx -; X64-NEXT: cmovgeq %rdx, %rax -; X64-NEXT: cmovgeq %rcx, %r8 +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: sbbq %rsi, %r8 +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: sbbq %rcx, %rsi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: cmovlq %rsi, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 
9f3b99b349aed..d9ba140032b31 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -744,8 +744,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovgel %edi, %eax -; X86-NEXT: cmovgel %ebx, %edx +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: cmovll %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -755,14 +755,13 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %cmp = icmp sge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -791,10 +790,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: cmovgel %ebp, %ecx -; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %ebx, %esi +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -809,20 +808,20 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi -; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovlq %rdi, %rax -; X64-NEXT: cmovlq %rsi, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rdx, %rax +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: sbbq %rcx, %r8 +; X64-NEXT: subq %rdi, %rdx +; X64-NEXT: sbbq %rsi, %rcx +; X64-NEXT: cmovgeq %rdx, %rax +; X64-NEXT: cmovgeq %rcx, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp sge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 9cb3e5e8bf0c2..507f7681400ef 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -824,8 +824,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovbl %edi, %eax -; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovael %ebx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -837,12 +837,12 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: negq %rax ; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovaeq %rdi, %rax +; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } @@ -871,10 +871,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl (%esp), %edx # 4-byte Folded Reload -; 
X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: cmovbl %ebp, %ecx -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovael %ebx, %esi +; X86-NEXT: cmovael %ebp, %ecx +; X86-NEXT: cmovael %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) @@ -895,14 +895,14 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X64-NEXT: sbbq %rsi, %r8 ; X64-NEXT: subq %rdx, %rdi ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovaeq %rdi, %rax -; X64-NEXT: cmovaeq %rsi, %r8 +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: cmovbq %rsi, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index c8fa19cb661b6..290894d2712e8 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -695,98 +695,83 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_cmp_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovael %edi, %eax -; X86-NEXT: cmovael %ebx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovaeq %rsi, %rax ; X64-NEXT: retq %cmp = icmp uge i64 %a, %b %ab = sub i64 %a, %b %ba = sub i64 %b, %a - %sel = select i1 %cmp, i64 %ba, i64 %ab + %sel = select i1 %cmp, i64 %ab, i64 %ba ret i64 %sel } define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovael 
%ebx, %esi -; X86-NEXT: cmovael %ebp, %ecx -; X86-NEXT: cmovael %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: subq %rdx, %rax ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovbq %rdi, %rax -; X64-NEXT: cmovbq %rsi, %r8 -; X64-NEXT: movq %r8, %rdx +; X64-NEXT: sbbq %rdi, %rdi +; X64-NEXT: xorq %rdi, %rsi +; X64-NEXT: xorq %rdi, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b %ba = sub i128 %b, %a - %sel = select i1 %cmp, i128 %ba, i128 %ab + %sel = select i1 %cmp, i128 %ab, i128 %ba ret i128 %sel } From b25b9a7d6c872e42121aa024f362fae0b15dd72c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 11:58:15 +0100 Subject: [PATCH 076/425] [DAG] visitSELECT - add "select usubo(x, y).overflow, (sub y, x), (usubo x, y) -> abdu(x, y)" fold (and neg equivalent) Handle cases where CGP has merged the CMP+SUB into a USUBO node - improves a few outstanding niggles from #100810 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++ llvm/test/CodeGen/X86/abdu-neg.ll | 111 +++++++++--------- llvm/test/CodeGen/X86/abdu.ll | 32 ++--- 3 files changed, 89 insertions(+), 75 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b0a906743f29f..6390231341f96 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11719,6 +11719,24 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { N2_2, Flags); } } + + // select usubo(x, y).overflow, (sub y, x), (usubo x, y) -> abdu(x, y) + if (N0.getOpcode() == ISD::USUBO && N0.getResNo() == 1 && + N2.getNode() == N0.getNode() && N2.getResNo() == 0 && + N1.getOpcode() == ISD::SUB && N2.getOperand(0) == N1.getOperand(1) && + N2.getOperand(1) == N1.getOperand(0) && + (!LegalOperations || TLI.isOperationLegal(ISD::ABDU, VT))) + return DAG.getNode(ISD::ABDU, DL, VT, N0.getOperand(0), N0.getOperand(1)); + + // select usubo(x, y).overflow, (usubo x, y), (sub y, x) -> neg (abdu x, y) + if (N0.getOpcode() == ISD::USUBO && N0.getResNo() == 1 && + N1.getNode() == N0.getNode() && N1.getResNo() == 0 && + N2.getOpcode() == ISD::SUB && N2.getOperand(0) == N1.getOperand(1) && + N2.getOperand(1) == N1.getOperand(0) && + (!LegalOperations || TLI.isOperationLegal(ISD::ABDU, VT))) + return DAG.getNegative( + DAG.getNode(ISD::ABDU, DL, VT, N0.getOperand(0), N0.getOperand(1)), + DL, VT); } // Fold selects based on a setcc into other things, such as min/max/abs. 
@@ -11776,6 +11794,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SelectNode; } + if (SDValue ABD = foldSelectToABD(Cond0, Cond1, N1, N2, CC, DL)) + return ABD; + if (SDValue NewSel = SimplifySelect(DL, N0, N1, N2)) return NewSel; } diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 507f7681400ef..24962be43b5cf 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -751,27 +751,23 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-LABEL: abd_cmp_i16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: subw %dx, %si -; X86-NEXT: movl %esi, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subw %si, %cx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: subl %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovnsl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %cmp = icmp ult i16 %a, %b @@ -811,33 +807,30 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_cmp_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: cmovael %edi, %eax -; X86-NEXT: cmovael %ebx, %edx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: negl %eax +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovaeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b @@ -853,34 +846,36 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovael (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovael %ebx, %esi -; X86-NEXT: cmovael %ebp, %ecx -; X86-NEXT: cmovael %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: xorl %ebp, %ecx +; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: xorl %ebp, %ebx +; X86-NEXT: xorl %ebp, %edx +; X86-NEXT: subl %ebp, %edx +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: negl %edx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: sbbl %ecx, %edi ; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %ebp, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -889,15 +884,19 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: subq %rdx, %rax ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovbq %rdi, %rax -; X64-NEXT: cmovbq %rsi, %r8 -; X64-NEXT: movq %r8, %rdx +; X64-NEXT: movl $0, %ecx +; X64-NEXT: sbbq %rcx, %rcx +; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: xorq %rcx, %rax +; X64-NEXT: subq %rcx, %rax +; X64-NEXT: sbbq %rcx, %rsi +; X64-NEXT: negq %rax +; X64-NEXT: sbbq %rsi, %rdi +; X64-NEXT: movq %rdi, %rdx ; X64-NEXT: retq %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 290894d2712e8..3bee81b61b98a 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -608,25 +608,21 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subb %cl, %dl -; X86-NEXT: negb %dl -; X86-NEXT: subb %cl, %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i8: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: subb %dil, %al -; X64-NEXT: negb %al -; X64-NEXT: subb %dil, %sil +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %cmp = icmp ugt i8 %a, %b @@ -670,9 +666,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, 
%edx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: cmovbl %edx, %eax ; X86-NEXT: retl @@ -681,9 +676,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: subl %esi, %edi -; X64-NEXT: cmovael %edi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovael %esi, %eax ; X64-NEXT: retq %cmp = icmp ult i32 %a, %b %ab = sub i32 %a, %b From 8d4235d97e5bd12b8244f9ffc157651a9a288b36 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 13:02:02 +0200 Subject: [PATCH 077/425] [Lint] Fix another scalable vector crash We also need to check that the memory access LocationSize is not scalable. --- llvm/lib/Analysis/Lint.cpp | 3 ++- llvm/test/Analysis/Lint/scalable.ll | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 415b16d25efd2..00e430ce8e0ab 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -454,7 +454,8 @@ void Lint::visitMemoryReference(Instruction &I, const MemoryLocation &Loc, // Accesses from before the start or after the end of the object are not // defined. - Check(!Loc.Size.hasValue() || BaseSize == MemoryLocation::UnknownSize || + Check(!Loc.Size.hasValue() || Loc.Size.isScalable() || + BaseSize == MemoryLocation::UnknownSize || (Offset >= 0 && Offset + Loc.Size.getValue() <= BaseSize), "Undefined behavior: Buffer overflow", &I); diff --git a/llvm/test/Analysis/Lint/scalable.ll b/llvm/test/Analysis/Lint/scalable.ll index 4bcc4dae8d837..bc12d6738d2aa 100644 --- a/llvm/test/Analysis/Lint/scalable.ll +++ b/llvm/test/Analysis/Lint/scalable.ll @@ -7,6 +7,13 @@ define @alloca_access() { ret %v } +; CHECK-NOT: Buffer overflow +define @alloca_access2() { + %a = alloca <256 x i8> + %v = load , ptr %a + ret %v +} + ; CHECK-NOT: insertelement index out of range define @insertelement() { %insert = insertelement poison, half 0xH0000, i64 100 From 55a24738302eb9bb5bf458220deb20ddef60ce51 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 13:04:36 +0200 Subject: [PATCH 078/425] [CtxProf] Replace include with forward declaration (NFC) This header is fairly expensive. Forward declare PGOContextualProfile instead. --- llvm/include/llvm/Transforms/Utils/Cloning.h | 4 ++-- llvm/lib/Transforms/Utils/InlineFunction.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index 2ddcfeb1501e2..a4be24e32c527 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -20,7 +20,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/ValueHandle.h" @@ -42,6 +41,7 @@ class Instruction; class Loop; class LoopInfo; class Module; +class PGOContextualProfile; class ProfileSummaryInfo; class ReturnInst; class DomTreeUpdater; @@ -276,7 +276,7 @@ InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, /// to the behavior of the non-contextual profile updating variant above. This /// makes it easy to drop-in replace uses of the non-contextual overload. 
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - CtxProfAnalysis::Result &CtxProf, + PGOContextualProfile &CtxProf, bool MergeAttributes = false, AAResults *CalleeAAR = nullptr, bool InsertLifetime = true, diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 799ef3ab021d3..2e05fa80464b8 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2142,7 +2142,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // it's not worth updating those. static const std::pair, std::vector> remapIndices(Function &Caller, BasicBlock *StartBB, - CtxProfAnalysis::Result &CtxProf, uint32_t CalleeCounters, + PGOContextualProfile &CtxProf, uint32_t CalleeCounters, uint32_t CalleeCallsites) { // We'll allocate a new ID to imported callsite counters and callsites. We're // using -1 to indicate a counter we delete. Most likely the entry ID, for @@ -2258,7 +2258,7 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // copying over the data of the callee, **intentionally without any value // scaling**, and copying over the callees of the inlined callee. llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - CtxProfAnalysis::Result &CtxProf, + PGOContextualProfile &CtxProf, bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime, From 43b8ae3cea7c0f45dc29479ba8024e0adae9d145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Wed, 4 Sep 2024 13:11:45 +0200 Subject: [PATCH 079/425] [AMDGPU][LDS] Pre-Commit tests for 'Fix dynamic LDS interaction with "amdgpu-no-lds-kernel-id" (#107091) --- .../AMDGPU/lower-module-lds-zero-size-arr.ll | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll index da1d23f1496cf..c7829be565373 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s ; This is an extension and should be rejected by the front-end in most cases. @@ -6,9 +6,13 @@ @Var0 = linkonce_odr hidden local_unnamed_addr addrspace(3) global [0 x float] poison -define void @fn(float %val, i32 %idx) { +;. +; CHECK: @llvm.amdgcn.kernelA.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds to i32)] +;. 
+define void @fn(float %val, i32 %idx) #0 { ; CHECK-LABEL: define void @fn( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VAR0]], align 4 @@ -22,9 +26,9 @@ define void @fn(float %val, i32 %idx) { ret void } -define amdgpu_kernel void @kernelA(float %val, i32 %idx) { +define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernelA( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds) ] ; CHECK-NEXT: tail call void @fn(float [[VAL]], i32 [[IDX]]) ; CHECK-NEXT: ret void @@ -32,6 +36,14 @@ define amdgpu_kernel void @kernelA(float %val, i32 %idx) { tail call void @fn(float %val, i32 %idx) ret void } + +attributes #0 = { "amdgpu-no-lds-kernel-id" } + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-lds-kernel-id" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. +; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1]] = !{i32 0} ;. From 9ba41031de105d7babf3ae53facd368f2b4e409f Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 4 Sep 2024 12:35:44 +0100 Subject: [PATCH 080/425] [OpenMP]Update use_device_clause lowering (#101703) This patch updates the use_device_ptr and use_device_addr clauses to use the mapInfoOps for lowering. This allows all the types that are handle by the map clauses such as derived types to also be supported by the use_device_clauses. This is patch 1/2 in a series of patches. 
Co-authored-by: Raghu Maddhipatla raghu.maddhipatla@amd.com --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 145 ++++++++----- flang/lib/Lower/OpenMP/ClauseProcessor.h | 50 ++--- flang/lib/Lower/OpenMP/OpenMP.cpp | 195 +++++++++--------- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 37 +++- flang/test/Lower/OpenMP/target.f90 | 12 +- .../use-device-ptr-to-use-device-addr.f90 | 96 +++++---- 6 files changed, 283 insertions(+), 252 deletions(-) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index dd6068ba048cc..6dee31ddb6963 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -887,13 +887,64 @@ bool ClauseProcessor::processLink( }); } +void ClauseProcessor::processMapObjects( + lower::StatementContext &stmtCtx, mlir::Location clauseLocation, + const omp::ObjectList &objects, + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + std::map> &parentMemberIndices, + llvm::SmallVectorImpl &mapVars, + llvm::SmallVectorImpl *mapSyms, + llvm::SmallVectorImpl *mapSymLocs, + llvm::SmallVectorImpl *mapSymTypes) const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + for (const omp::Object &object : objects) { + llvm::SmallVector bounds; + std::stringstream asFortran; + + lower::AddrAndBoundsInfo info = + lower::gatherDataOperandAddrAndBounds( + converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), + object.ref(), clauseLocation, asFortran, bounds, + treatIndexAsSection); + + // Explicit map captures are captured ByRef by default, + // optimisation passes may alter this to ByCopy or other capture + // types to optimise + mlir::Value baseOp = info.rawInput; + auto location = mlir::NameLoc::get( + mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), + baseOp.getLoc()); + mlir::omp::MapInfoOp mapOp = createMapInfoOp( + firOpBuilder, location, baseOp, + /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, + /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, + static_cast< + std::underlying_type_t>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); + + if (object.sym()->owner().IsDerivedType()) { + addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, semaCtx); + } else { + mapVars.push_back(mapOp); + if (mapSyms) + mapSyms->push_back(object.sym()); + if (mapSymTypes) + mapSymTypes->push_back(baseOp.getType()); + if (mapSymLocs) + mapSymLocs->push_back(baseOp.getLoc()); + } + } +} + bool ClauseProcessor::processMap( mlir::Location currentLocation, lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, llvm::SmallVectorImpl *mapSyms, llvm::SmallVectorImpl *mapSymLocs, llvm::SmallVectorImpl *mapSymTypes) const { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // We always require tracking of symbols, even if the caller does not, // so we create an optionally used local set of symbols when the mapSyms // argument is not present. 
@@ -948,46 +999,10 @@ bool ClauseProcessor::processMap( mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - - for (const omp::Object &object : std::get(clause.t)) { - llvm::SmallVector bounds; - std::stringstream asFortran; - - lower::AddrAndBoundsInfo info = - lower::gatherDataOperandAddrAndBounds( - converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), - object.ref(), clauseLocation, asFortran, bounds, - treatIndexAsSection); - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value baseOp = info.rawInput; - auto location = mlir::NameLoc::get( - mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), - baseOp.getLoc()); - mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, location, baseOp, - /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, - /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); - - if (object.sym()->owner().IsDerivedType()) { - addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, - semaCtx); - } else { - result.mapVars.push_back(mapOp); - ptrMapSyms->push_back(object.sym()); - if (mapSymTypes) - mapSymTypes->push_back(baseOp.getType()); - if (mapSymLocs) - mapSymLocs->push_back(baseOp.getLoc()); - } - } + processMapObjects(stmtCtx, clauseLocation, + std::get(clause.t), mapTypeBits, + parentMemberIndices, result.mapVars, ptrMapSyms, + mapSymLocs, mapSymTypes); }); insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars, @@ -1050,27 +1065,55 @@ bool ClauseProcessor::processEnter( } bool ClauseProcessor::processUseDeviceAddr( - mlir::omp::UseDeviceAddrClauseOps &result, + lower::StatementContext &stmtCtx, mlir::omp::UseDeviceAddrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const { - return findRepeatableClause( - [&](const omp::clause::UseDeviceAddr &clause, const parser::CharBlock &) { - addUseDeviceClause(converter, clause.v, result.useDeviceAddrVars, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + std::map> + parentMemberIndices; + bool clauseFound = findRepeatableClause( + [&](const omp::clause::UseDeviceAddr &clause, + const parser::CharBlock &source) { + mlir::Location location = converter.genLocation(source); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + processMapObjects(stmtCtx, location, clause.v, mapTypeBits, + parentMemberIndices, result.useDeviceAddrVars, + &useDeviceSyms, &useDeviceLocs, &useDeviceTypes); }); + + insertChildMapInfoIntoParent(converter, parentMemberIndices, + result.useDeviceAddrVars, useDeviceSyms, + &useDeviceTypes, &useDeviceLocs); + return clauseFound; } bool ClauseProcessor::processUseDevicePtr( - mlir::omp::UseDevicePtrClauseOps &result, + lower::StatementContext &stmtCtx, mlir::omp::UseDevicePtrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const { - return findRepeatableClause( - [&](const omp::clause::UseDevicePtr &clause, const parser::CharBlock &) { - addUseDeviceClause(converter, clause.v, result.useDevicePtrVars, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + std::map> + 
parentMemberIndices; + bool clauseFound = findRepeatableClause( + [&](const omp::clause::UseDevicePtr &clause, + const parser::CharBlock &source) { + mlir::Location location = converter.genLocation(source); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + processMapObjects(stmtCtx, location, clause.v, mapTypeBits, + parentMemberIndices, result.useDevicePtrVars, + &useDeviceSyms, &useDeviceLocs, &useDeviceTypes); }); + + insertChildMapInfoIntoParent(converter, parentMemberIndices, + result.useDevicePtrVars, useDeviceSyms, + &useDeviceTypes, &useDeviceLocs); + return clauseFound; } } // namespace omp diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 4a90f667c7248..f6b319c726a2d 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -128,11 +128,13 @@ class ClauseProcessor { nullptr) const; bool processTo(llvm::SmallVectorImpl &result) const; bool processUseDeviceAddr( + lower::StatementContext &stmtCtx, mlir::omp::UseDeviceAddrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, llvm::SmallVectorImpl &useDeviceSyms) const; bool processUseDevicePtr( + lower::StatementContext &stmtCtx, mlir::omp::UseDevicePtrClauseOps &result, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, @@ -172,6 +174,17 @@ class ClauseProcessor { template bool markClauseOccurrence(mlir::UnitAttr &result) const; + void processMapObjects( + lower::StatementContext &stmtCtx, mlir::Location clauseLocation, + const omp::ObjectList &objects, + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + std::map> &parentMemberIndices, + llvm::SmallVectorImpl &mapVars, + llvm::SmallVectorImpl *mapSyms, + llvm::SmallVectorImpl *mapSymLocs = nullptr, + llvm::SmallVectorImpl *mapSymTypes = nullptr) const; + lower::AbstractConverter &converter; semantics::SemanticsContext &semaCtx; List clauses; @@ -188,7 +201,6 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, bool clauseFound = findRepeatableClause( [&](const T &clause, const parser::CharBlock &source) { mlir::Location clauseLocation = converter.genLocation(source); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); static_assert(std::is_same_v || std::is_same_v); @@ -199,39 +211,9 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, ? 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - auto &objects = std::get(clause.t); - for (const omp::Object &object : objects) { - llvm::SmallVector bounds; - std::stringstream asFortran; - - lower::AddrAndBoundsInfo info = - lower::gatherDataOperandAddrAndBounds( - converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), - object.ref(), clauseLocation, asFortran, bounds, - treatIndexAsSection); - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value baseOp = info.rawInput; - mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, baseOp, - /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, - /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); - - if (object.sym()->owner().IsDerivedType()) { - addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, - semaCtx); - } else { - result.mapVars.push_back(mapOp); - mapSymbols.push_back(object.sym()); - } - } + processMapObjects(stmtCtx, clauseLocation, + std::get(clause.t), mapTypeBits, + parentMemberIndices, result.mapVars, &mapSymbols); }); insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 2fd5d4b33074e..8b77f1ac6b4ff 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -702,45 +702,94 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, marker->erase(); } +void mapBodySymbols(lower::AbstractConverter &converter, mlir::Region ®ion, + llvm::ArrayRef mapSyms) { + assert(region.hasOneBlock() && "target must have single region"); + mlir::Block ®ionBlock = region.front(); + // Clones the `bounds` placing them inside the target region and returns them. + auto cloneBound = [&](mlir::Value bound) { + if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { + mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); + regionBlock.push_back(clonedOp); + return clonedOp->getResult(0); + } + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported bound type"); + }; + + auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { + llvm::SmallVector clonedBounds; + for (mlir::Value bound : bounds) + clonedBounds.emplace_back(cloneBound(bound)); + return clonedBounds; + }; + + // Bind the symbols to their corresponding block arguments. + for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { + const mlir::BlockArgument &arg = region.getArgument(argIndex); + // Avoid capture of a reference to a structured binding. + const semantics::Symbol *sym = argSymbol; + // Structure component symbols don't have bindings. 
+ if (sym->owner().IsDerivedType()) + continue; + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + auto refType = mlir::dyn_cast(arg.getType()); + if (refType && fir::isa_builtin_cptr_type(refType.getElementType())) { + converter.bindSymbol(*argSymbol, arg); + } else { + extVal.match( + [&](const fir::BoxValue &v) { + converter.bindSymbol(*sym, + fir::BoxValue(arg, cloneBounds(v.getLBounds()), + v.getExplicitParameters(), + v.getExplicitExtents())); + }, + [&](const fir::MutableBoxValue &v) { + converter.bindSymbol( + *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), + v.getMutableProperties())); + }, + [&](const fir::ArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()), + v.getSourceBox())); + }, + [&](const fir::CharArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), + cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()))); + }, + [&](const fir::CharBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharBoxValue(arg, cloneBound(v.getLen()))); + }, + [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, + [&](const auto &) { + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported type"); + }); + } + } +} + static void genBodyOfTargetDataOp( lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef useDeviceTypes, - llvm::ArrayRef useDeviceLocs, + mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef useDeviceSymbols, + llvm::ArrayRef useDeviceLocs, + llvm::ArrayRef useDeviceTypes, const mlir::Location ¤tLocation, const ConstructQueue &queue, ConstructQueue::const_iterator item) { + assert(useDeviceTypes.size() == useDeviceLocs.size()); + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Region ®ion = dataOp.getRegion(); - firOpBuilder.createBlock(®ion, {}, useDeviceTypes, useDeviceLocs); - for (auto [argIndex, argSymbol] : llvm::enumerate(useDeviceSymbols)) { - const mlir::BlockArgument &arg = region.front().getArgument(argIndex); - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*argSymbol); - if (auto refType = mlir::dyn_cast(arg.getType())) { - if (fir::isa_builtin_cptr_type(refType.getElementType())) { - converter.bindSymbol(*argSymbol, arg); - } else { - // Avoid capture of a reference to a structured binding. - const semantics::Symbol *sym = argSymbol; - extVal.match( - [&](const fir::MutableBoxValue &mbv) { - converter.bindSymbol( - *sym, - fir::MutableBoxValue( - arg, fir::factory::getNonDeferredLenParams(extVal), {})); - }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "use_device clause operand unsupported type"); - }); - } - } else { - TODO(converter.getCurrentLocation(), - "use_device clause operand unsupported type"); - } - } + mapBodySymbols(converter, region, useDeviceSymbols); // Insert dummy instruction to remember the insertion position. The // marker will be deleted by clean up passes since there are no uses. @@ -748,7 +797,7 @@ static void genBodyOfTargetDataOp( // there are hlfir.declares inserted above while setting block arguments // and new code from the body should be inserted after that. mlir::Value undefMarker = firOpBuilder.create( - dataOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + dataOp.getLoc(), firOpBuilder.getIndexType()); // Create blocks for unstructured regions. 
This has to be done since // blocks are initially allocated with the function as the parent region. @@ -806,93 +855,33 @@ static void genBodyOfTargetOp( mlir::omp::TargetOp &targetOp, llvm::ArrayRef mapSyms, llvm::ArrayRef mapSymLocs, - llvm::ArrayRef mapSymTypes, DataSharingProcessor &dsp, + llvm::ArrayRef mapSymTypes, const mlir::Location ¤tLocation, const ConstructQueue &queue, - ConstructQueue::const_iterator item) { + ConstructQueue::const_iterator item, DataSharingProcessor &dsp) { assert(mapSymTypes.size() == mapSymLocs.size()); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Region ®ion = targetOp.getRegion(); llvm::SmallVector allRegionArgTypes; + llvm::SmallVector allRegionArgLocs; mergePrivateVarsInfo(targetOp, mapSymTypes, llvm::function_ref{ [](mlir::Value v) { return v.getType(); }}, allRegionArgTypes); - llvm::SmallVector allRegionArgLocs; mergePrivateVarsInfo(targetOp, mapSymLocs, llvm::function_ref{ [](mlir::Value v) { return v.getLoc(); }}, allRegionArgLocs); - auto *regionBlock = firOpBuilder.createBlock(®ion, {}, allRegionArgTypes, - allRegionArgLocs); + mlir::Block *regionBlock = firOpBuilder.createBlock( + ®ion, {}, allRegionArgTypes, allRegionArgLocs); - // Clones the `bounds` placing them inside the target region and returns them. - auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); - regionBlock->push_back(clonedOp); - return clonedOp->getResult(0); - } - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported bound type"); - }; - - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - for (mlir::Value bound : bounds) - clonedBounds.emplace_back(cloneBound(bound)); - return clonedBounds; - }; - - // Bind the symbols to their corresponding block arguments. - for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { - const mlir::BlockArgument &arg = region.getArgument(argIndex); - // Avoid capture of a reference to a structured binding. - const semantics::Symbol *sym = argSymbol; - // Structure component symbols don't have bindings. 
- if (sym->owner().IsDerivedType()) - continue; - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); - extVal.match( - [&](const fir::BoxValue &v) { - converter.bindSymbol(*sym, - fir::BoxValue(arg, cloneBounds(v.getLBounds()), - v.getExplicitParameters(), - v.getExplicitExtents())); - }, - [&](const fir::MutableBoxValue &v) { - converter.bindSymbol( - *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), - v.getMutableProperties())); - }, - [&](const fir::ArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()), - v.getSourceBox())); - }, - [&](const fir::CharArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), - cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()))); - }, - [&](const fir::CharBoxValue &v) { - converter.bindSymbol(*sym, - fir::CharBoxValue(arg, cloneBound(v.getLen()))); - }, - [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported type"); - }); - } + mapBodySymbols(converter, region, mapSyms); for (auto [argIndex, argSymbol] : - llvm::enumerate(dsp.getDelayedPrivSymbols())) { + llvm::enumerate(dsp.getAllSymbolsToPrivatize())) { argIndex = mapSyms.size() + argIndex; const mlir::BlockArgument &arg = region.getArgument(argIndex); @@ -940,7 +929,9 @@ static void genBodyOfTargetOp( std::underlying_type_t>( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + targetOp.getMapVarsMutable().append(mapOp); + mlir::Value clonedValArg = region.addArgument(copyVal.getType(), copyVal.getLoc()); firOpBuilder.setInsertionPointToStart(regionBlock); @@ -962,7 +953,7 @@ static void genBodyOfTargetOp( // In the HLFIR flow there are hlfir.declares inserted above while // setting block arguments. mlir::Value undefMarker = firOpBuilder.create( - targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + targetOp.getLoc(), firOpBuilder.getIndexType()); // Create blocks for unstructured regions. This has to be done since // blocks are initially allocated with the function as the parent region. 
@@ -1201,9 +1192,9 @@ static void genTargetDataClauses( cp.processDevice(stmtCtx, clauseOps); cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); cp.processMap(loc, stmtCtx, clauseOps); - cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, + cp.processUseDeviceAddr(stmtCtx, clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); - cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, + cp.processUseDevicePtr(stmtCtx, clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); // This function implements the deprecated functionality of use_device_ptr @@ -1807,7 +1798,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, auto targetOp = firOpBuilder.create(loc, clauseOps); genBodyOfTargetOp(converter, symTable, semaCtx, eval, targetOp, mapSyms, - mapLocs, mapTypes, dsp, loc, queue, item); + mapLocs, mapTypes, loc, queue, item, dsp); return targetOp; } @@ -1829,7 +1820,7 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable, converter.getFirOpBuilder().create(loc, clauseOps); genBodyOfTargetDataOp(converter, symTable, semaCtx, eval, targetDataOp, - useDeviceTypes, useDeviceLocs, useDeviceSyms, loc, + useDeviceSyms, useDeviceLocs, useDeviceTypes, loc, queue, item); return targetDataOp; } diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 04a11a52dbd04..7ebeb51cf3dec 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -125,13 +125,12 @@ class MapInfoFinalizationPass // TODO: map the addendum segment of the descriptor, similarly to the // above base address/data pointer member. - if (auto mapClauseOwner = - llvm::dyn_cast(target)) { + auto addOperands = [&](mlir::OperandRange &operandsArr, + mlir::MutableOperandRange &mutableOpRange, + auto directiveOp) { llvm::SmallVector newMapOps; - mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars(); - - for (size_t i = 0; i < mapVarsArr.size(); ++i) { - if (mapVarsArr[i] == op) { + for (size_t i = 0; i < operandsArr.size(); ++i) { + if (operandsArr[i] == op) { // Push new implicit maps generated for the descriptor. newMapOps.push_back(baseAddr); @@ -139,13 +138,29 @@ class MapInfoFinalizationPass // new additional map operand with an appropriate BlockArgument, // as the printing and later processing currently requires a 1:1 // mapping of BlockArgs to MapInfoOp's at the same placement in - // each array (BlockArgs and MapVars). - if (auto targetOp = llvm::dyn_cast(target)) - targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc); + // each array (BlockArgs and MapOperands). 
+ if (directiveOp) { + directiveOp.getRegion().insertArgument(i, baseAddr.getType(), loc); + } } - newMapOps.push_back(mapVarsArr[i]); + newMapOps.push_back(operandsArr[i]); } - mapClauseOwner.getMapVarsMutable().assign(newMapOps); + mutableOpRange.assign(newMapOps); + }; + if (auto mapClauseOwner = + llvm::dyn_cast(target)) { + mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapVars(); + mlir::MutableOperandRange mapMutableOpRange = + mapClauseOwner.getMapVarsMutable(); + mlir::omp::TargetOp targetOp = + llvm::dyn_cast(target); + addOperands(mapOperandsArr, mapMutableOpRange, targetOp); + } + if (auto targetDataOp = llvm::dyn_cast(target)) { + mlir::OperandRange useDevAddrArr = targetDataOp.getUseDeviceAddrVars(); + mlir::MutableOperandRange useDevAddrMutableOpRange = + targetDataOp.getUseDeviceAddrVarsMutable(); + addOperands(useDevAddrArr, useDevAddrMutableOpRange, targetDataOp); } mlir::Value newDescParentMapOp = builder.create( diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 1d5ab6942dfa3..e86a2f9b6098d 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -526,21 +526,23 @@ end subroutine omp_target_device_ptr !=============================================================================== !CHECK-LABEL: func.func @_QPomp_target_device_addr() { - subroutine omp_target_device_addr +subroutine omp_target_device_addr integer, pointer :: a !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"} !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} - !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref>>) { + !CHECK: %[[DEV_ADDR_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} + !CHECK: %[[DEV_ADDR:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[DEV_ADDR_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} + !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[DEV_ADDR_MEMBERS]], %[[DEV_ADDR]] : {{.*}}) { !$omp target data map(tofrom: a) use_device_addr(a) - !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref>>): - !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + !CHECK: ^bb0(%[[ARG_0:.*]]: !fir.llvm_ptr>, %[[ARG_1:.*]]: !fir.ref>>): + !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[ARG_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[C10:.*]] = arith.constant 10 : i32 !CHECK: %[[A_BOX:.*]] = fir.load %[[VAL_1_DECL]]#0 : !fir.ref>> !CHECK: %[[A_ADDR:.*]] = fir.box_addr %[[A_BOX]] : (!fir.box>) -> !fir.ptr !CHECK: hlfir.assign %[[C10]] to %[[A_ADDR]] : i32, !fir.ptr - a = 10 + a = 10 !CHECK: omp.terminator !$omp 
end target data !CHECK: } diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index acb5f533b619e..085f5419fa7f8 100644 --- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 +++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 @@ -2,72 +2,70 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s -! This tests primary goal is to check the promotion of -! non-CPTR arguments from use_device_ptr to -! use_device_addr works, without breaking any -! functionality +! This tests primary goal is to check the promotion of non-CPTR arguments from +! use_device_ptr to use_device_addr works, without breaking any functionality. !CHECK: func.func @{{.*}}only_use_device_ptr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): -subroutine only_use_device_ptr +!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { +!CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +subroutine only_use_device_ptr use iso_c_binding integer, pointer, dimension(:) :: array real, pointer :: pa(:) type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr, array) + !$omp end target data + end subroutine !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr({{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine mix_use_device_ptr_and_addr +!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr({{.*}} : !fir.ref>) { +!CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): +subroutine mix_use_device_ptr_and_addr use iso_c_binding integer, pointer, dimension(:) :: array real, pointer :: pa(:) type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}only_use_device_addr() -!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine only_use_device_addr - use iso_c_binding - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}only_use_device_addr() + !CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { + !CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>): + subroutine only_use_device_addr + use iso_c_binding + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + 
type(c_ptr) :: cptr - !$omp target data use_device_addr(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data use_device_addr(pa, cptr, array) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() -!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): -subroutine mix_use_device_ptr_and_addr_and_map - use iso_c_binding - integer :: i, j - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() + !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_addr(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) use_device_ptr(%{{.*}} : !fir.ref>) { + !CHECK: ^bb0(%{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.llvm_ptr>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): + subroutine mix_use_device_ptr_and_addr_and_map + use iso_c_binding + integer :: i, j + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + type(c_ptr) :: cptr - !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) map(tofrom: i, j) - !$omp end target data -end subroutine + !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) map(tofrom: i, j) + !$omp end target data + end subroutine -!CHECK: func.func @{{.*}}only_use_map() -!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { -subroutine only_use_map - use iso_c_binding - integer, pointer, dimension(:) :: array - real, pointer :: pa(:) - type(c_ptr) :: cptr + !CHECK: func.func @{{.*}}only_use_map() + !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr>>, !fir.ref>>>, !fir.ref>, !fir.llvm_ptr>>, !fir.ref>>>) { + subroutine only_use_map + use iso_c_binding + integer, pointer, dimension(:) :: array + real, pointer :: pa(:) + type(c_ptr) :: cptr - !$omp target data map(pa, cptr, array) - !$omp end target data -end subroutine + !$omp target data map(pa, cptr, array) + !$omp end target data + end subroutine From 2cf36f0293daf0fa28d5c7d5d3617660edf237e7 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 4 Sep 2024 12:36:03 +0100 Subject: [PATCH 081/425] [OpenMP]Update use_device_clause lowering (#101707) This patch updates the use_device_ptr and use_device_addr clauses to use the mapInfoOps for lowering. This allows all the types that are handle by the map clauses such as derived types to also be supported by the use_device_clauses. This is patch 2/2 in a series of patches. 
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 354 +++++++++--------- mlir/test/Target/LLVMIR/omptarget-llvm.mlir | 16 +- .../openmp-target-use-device-nested.mlir | 39 ++ 4 files changed, 228 insertions(+), 183 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 027b927fa6424..71d51affba642 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6371,6 +6371,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( if (!updateToLocation(Loc)) return InsertPointTy(); + Builder.restoreIP(CodeGenIP); // Disable TargetData CodeGen on Device pass. if (Config.IsTargetDevice.value_or(false)) { if (BodyGenCB) @@ -6378,7 +6379,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( return Builder.saveIP(); } - Builder.restoreIP(CodeGenIP); bool IsStandAlone = !BodyGenCB; MapInfosTy *MapInfo; // Generate the code for the opening of the data environment. Capture all the diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6a32aeb444140..d597bbfee2fe1 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2260,6 +2260,7 @@ getRefPtrIfDeclareTarget(mlir::Value value, return nullptr; } +namespace { // A small helper structure to contain data gathered // for map lowering and coalese it into one area and // avoiding extra computations such as searches in the @@ -2269,6 +2270,8 @@ getRefPtrIfDeclareTarget(mlir::Value value, struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { llvm::SmallVector IsDeclareTarget; llvm::SmallVector IsAMember; + // Identify if mapping was added by mapClause or use_device clauses. + llvm::SmallVector IsAMapping; llvm::SmallVector MapClause; llvm::SmallVector OriginalValue; // Stripped off array/pointer to get the underlying @@ -2286,6 +2289,7 @@ struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { llvm::OpenMPIRBuilder::MapInfosTy::append(CurInfo); } }; +} // namespace uint64_t getArrayElementSizeInBits(LLVM::LLVMArrayType arrTy, DataLayout &dl) { if (auto nestedArrTy = llvm::dyn_cast_if_present( @@ -2352,80 +2356,126 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type, return builder.getInt64(dl.getTypeSizeInBits(type) / 8); } -void collectMapDataFromMapVars(MapInfoData &mapData, - llvm::SmallVectorImpl &mapVars, - LLVM::ModuleTranslation &moduleTranslation, - DataLayout &dl, llvm::IRBuilderBase &builder) { - for (mlir::Value mapValue : mapVars) { - if (auto mapOp = mlir::dyn_cast_if_present( - mapValue.getDefiningOp())) { - mlir::Value offloadPtr = +static void collectMapDataFromMapOperands( + MapInfoData &mapData, SmallVectorImpl &mapVars, + LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, + llvm::IRBuilderBase &builder, const ArrayRef &useDevPtrOperands = {}, + const ArrayRef &useDevAddrOperands = {}) { + auto checkIsAMember = [](const auto &mapVars, auto mapOp) { + // Check if this is a member mapping and correctly assign that it is, if + // it is a member of a larger object. + // TODO: Need better handling of members, and distinguishing of members + // that are implicitly allocated on device vs explicitly passed in as + // arguments. 
+ // TODO: May require some further additions to support nested record + // types, i.e. member maps that can have member maps. + for (Value mapValue : mapVars) { + auto map = cast(mapValue.getDefiningOp()); + for (auto member : map.getMembers()) + if (member == mapOp) + return true; + } + return false; + }; + + // Process MapOperands + for (Value mapValue : mapVars) { + auto mapOp = cast(mapValue.getDefiningOp()); + Value offloadPtr = + mapOp.getVarPtrPtr() ? mapOp.getVarPtrPtr() : mapOp.getVarPtr(); + mapData.OriginalValue.push_back(moduleTranslation.lookupValue(offloadPtr)); + mapData.Pointers.push_back(mapData.OriginalValue.back()); + + if (llvm::Value *refPtr = + getRefPtrIfDeclareTarget(offloadPtr, + moduleTranslation)) { // declare target + mapData.IsDeclareTarget.push_back(true); + mapData.BasePointers.push_back(refPtr); + } else { // regular mapped variable + mapData.IsDeclareTarget.push_back(false); + mapData.BasePointers.push_back(mapData.OriginalValue.back()); + } + + mapData.BaseType.push_back( + moduleTranslation.convertType(mapOp.getVarType())); + mapData.Sizes.push_back( + getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(), + mapData.BaseType.back(), builder, moduleTranslation)); + mapData.MapClause.push_back(mapOp.getOperation()); + mapData.Types.push_back( + llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value())); + mapData.Names.push_back(LLVM::createMappingInformation( + mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); + mapData.DevicePointers.push_back(llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.IsAMapping.push_back(true); + mapData.IsAMember.push_back(checkIsAMember(mapVars, mapOp)); + } + + auto findMapInfo = [&mapData](llvm::Value *val, + llvm::OpenMPIRBuilder::DeviceInfoTy devInfoTy) { + unsigned index = 0; + bool found = false; + for (llvm::Value *basePtr : mapData.OriginalValue) { + if (basePtr == val && mapData.IsAMapping[index]) { + found = true; + mapData.Types[index] |= + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mapData.DevicePointers[index] = devInfoTy; + } + index++; + } + return found; + }; + + // Process useDevPtr(Addr)Operands + auto addDevInfos = [&](const llvm::ArrayRef &useDevOperands, + llvm::OpenMPIRBuilder::DeviceInfoTy devInfoTy) { + for (Value mapValue : useDevOperands) { + auto mapOp = cast(mapValue.getDefiningOp()); + Value offloadPtr = mapOp.getVarPtrPtr() ? mapOp.getVarPtrPtr() : mapOp.getVarPtr(); - mapData.OriginalValue.push_back( - moduleTranslation.lookupValue(offloadPtr)); - mapData.Pointers.push_back(mapData.OriginalValue.back()); - - if (llvm::Value *refPtr = - getRefPtrIfDeclareTarget(offloadPtr, - moduleTranslation)) { // declare target - mapData.IsDeclareTarget.push_back(true); - mapData.BasePointers.push_back(refPtr); - } else { // regular mapped variable + llvm::Value *origValue = moduleTranslation.lookupValue(offloadPtr); + + // Check if map info is already present for this entry. 
+ if (!findMapInfo(origValue, devInfoTy)) { + mapData.OriginalValue.push_back(origValue); + mapData.Pointers.push_back(mapData.OriginalValue.back()); mapData.IsDeclareTarget.push_back(false); mapData.BasePointers.push_back(mapData.OriginalValue.back()); - } - - mapData.BaseType.push_back( - moduleTranslation.convertType(mapOp.getVarType())); - mapData.Sizes.push_back( - getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(), - mapData.BaseType.back(), builder, moduleTranslation)); - mapData.MapClause.push_back(mapOp.getOperation()); - mapData.Types.push_back( - llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value())); - mapData.Names.push_back(LLVM::createMappingInformation( - mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); - mapData.DevicePointers.push_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); - - // Check if this is a member mapping and correctly assign that it is, if - // it is a member of a larger object. - // TODO: Need better handling of members, and distinguishing of members - // that are implicitly allocated on device vs explicitly passed in as - // arguments. - // TODO: May require some further additions to support nested record - // types, i.e. member maps that can have member maps. - mapData.IsAMember.push_back(false); - for (mlir::Value mapValue : mapVars) { - if (auto map = mlir::dyn_cast_if_present( - mapValue.getDefiningOp())) { - for (auto member : map.getMembers()) { - if (member == mapOp) { - mapData.IsAMember.back() = true; - } - } - } + mapData.BaseType.push_back( + moduleTranslation.convertType(mapOp.getVarType())); + mapData.Sizes.push_back(builder.getInt64(0)); + mapData.MapClause.push_back(mapOp.getOperation()); + mapData.Types.push_back( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM); + mapData.Names.push_back(LLVM::createMappingInformation( + mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); + mapData.DevicePointers.push_back(devInfoTy); + mapData.IsAMapping.push_back(true); + mapData.IsAMember.push_back(checkIsAMember(useDevOperands, mapOp)); } } - } + }; + + addDevInfos(useDevPtrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); + addDevInfos(useDevAddrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); } -static int getMapDataMemberIdx(MapInfoData &mapData, - mlir::omp::MapInfoOp memberOp) { +static int getMapDataMemberIdx(MapInfoData &mapData, omp::MapInfoOp memberOp) { auto *res = llvm::find(mapData.MapClause, memberOp); assert(res != mapData.MapClause.end() && "MapInfoOp for member not found in MapData, cannot return index"); return std::distance(mapData.MapClause.begin(), res); } -static mlir::omp::MapInfoOp -getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { - mlir::DenseIntElementsAttr indexAttr = mapInfo.getMembersIndexAttr(); +static omp::MapInfoOp getFirstOrLastMappedMemberPtr(omp::MapInfoOp mapInfo, + bool first) { + DenseIntElementsAttr indexAttr = mapInfo.getMembersIndexAttr(); // Only 1 member has been mapped, we can return it. 
if (indexAttr.size() == 1) - if (auto mapOp = mlir::dyn_cast( - mapInfo.getMembers()[0].getDefiningOp())) + if (auto mapOp = + dyn_cast(mapInfo.getMembers()[0].getDefiningOp())) return mapOp; llvm::ArrayRef shape = indexAttr.getShapedType().getShape(); @@ -2462,7 +2512,7 @@ getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { return false; }); - return llvm::cast( + return llvm::cast( mapInfo.getMembers()[indices.front()].getDefiningOp()); } @@ -2488,7 +2538,7 @@ getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) { std::vector calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, bool isArrayTy, - mlir::OperandRange bounds) { + OperandRange bounds) { std::vector idx; // There's no bounds to calculate an offset from, we can safely // ignore and return no indices. @@ -2502,7 +2552,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, if (isArrayTy) { idx.push_back(builder.getInt64(0)); for (int i = bounds.size() - 1; i >= 0; --i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { idx.push_back(moduleTranslation.lookupValue(boundOp.getLowerBound())); } @@ -2528,7 +2578,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // (extent/size of current) 100 for 1000 for each index increment std::vector dimensionIndexSizeOffset{builder.getInt64(1)}; for (size_t i = 1; i < bounds.size(); ++i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { dimensionIndexSizeOffset.push_back(builder.CreateMul( moduleTranslation.lookupValue(boundOp.getExtent()), @@ -2541,7 +2591,7 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // have calculated in the previous and accumulate the results to get // our final resulting offset. for (int i = bounds.size() - 1; i >= 0; --i) { - if (auto boundOp = mlir::dyn_cast_if_present( + if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { if (idx.empty()) idx.emplace_back(builder.CreateMul( @@ -2585,7 +2635,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE); combinedInfo.DevicePointers.emplace_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.DevicePointers[mapDataIndex]); combinedInfo.Names.emplace_back(LLVM::createMappingInformation( mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -2598,7 +2648,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( // data by the descriptor (which itself, is a structure containing // runtime information on the dynamically allocated data). 
auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); llvm::Value *lowAddr, *highAddr; if (!parentClause.getPartialMap()) { @@ -2610,8 +2660,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( builder.getPtrTy()); combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]); } else { - auto mapOp = - mlir::dyn_cast(mapData.MapClause[mapDataIndex]); + auto mapOp = dyn_cast(mapData.MapClause[mapDataIndex]); int firstMemberIdx = getMapDataMemberIdx( mapData, getFirstOrLastMappedMemberPtr(mapOp, true)); lowAddr = builder.CreatePointerCast(mapData.Pointers[firstMemberIdx], @@ -2669,7 +2718,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( // There may be a better way to verify this, but unfortunately with // opaque pointers we lose the ability to easily check if something is // a pointer whilst maintaining access to the underlying type. -static bool checkIfPointerMap(mlir::omp::MapInfoOp mapOp) { +static bool checkIfPointerMap(omp::MapInfoOp mapOp) { // If we have a varPtrPtr field assigned then the underlying type is a pointer if (mapOp.getVarPtrPtr()) return true; @@ -2691,11 +2740,11 @@ static void processMapMembersWithParent( uint64_t mapDataIndex, llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) { auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); for (auto mappedMembers : parentClause.getMembers()) { auto memberClause = - llvm::cast(mappedMembers.getDefiningOp()); + llvm::cast(mappedMembers.getDefiningOp()); int memberDataIdx = getMapDataMemberIdx(mapData, memberClause); assert(memberDataIdx >= 0 && "could not find mapped member of structure"); @@ -2712,7 +2761,7 @@ static void processMapMembersWithParent( combinedInfo.Types.emplace_back(mapFlag); combinedInfo.DevicePointers.emplace_back( - llvm::OpenMPIRBuilder::DeviceInfoTy::None); + mapData.DevicePointers[memberDataIdx]); combinedInfo.Names.emplace_back( LLVM::createMappingInformation(memberClause.getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -2729,8 +2778,7 @@ processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, // OMP_MAP_TARGET_PARAM as they are not passed as parameters, they're // marked with OMP_MAP_PTR_AND_OBJ instead. auto mapFlag = mapData.Types[mapDataIdx]; - auto mapInfoOp = - llvm::cast(mapData.MapClause[mapDataIdx]); + auto mapInfoOp = llvm::cast(mapData.MapClause[mapDataIdx]); bool isPtrTy = checkIfPointerMap(mapInfoOp); if (isPtrTy) @@ -2740,7 +2788,7 @@ processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM; if (mapInfoOp.getMapCaptureType().value() == - mlir::omp::VariableCaptureKind::ByCopy && + omp::VariableCaptureKind::ByCopy && !isPtrTy) mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL; @@ -2766,13 +2814,13 @@ static void processMapWithMembersOf( llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, uint64_t mapDataIndex, bool isTargetParams) { auto parentClause = - llvm::cast(mapData.MapClause[mapDataIndex]); + llvm::cast(mapData.MapClause[mapDataIndex]); // If we have a partial map (no parent referenced in the map clauses of the // directive, only members) and only a single member, we do not need to bind // the map of the member to the parent, we can pass the member separately. 
if (parentClause.getMembers().size() == 1 && parentClause.getPartialMap()) { - auto memberClause = llvm::cast( + auto memberClause = llvm::cast( parentClause.getMembers()[0].getDefiningOp()); int memberDataIdx = getMapDataMemberIdx(mapData, memberClause); // Note: Clang treats arrays with explicit bounds that fall into this @@ -2809,11 +2857,9 @@ createAlteredByCaptureMap(MapInfoData &mapData, for (size_t i = 0; i < mapData.MapClause.size(); ++i) { // if it's declare target, skip it, it's handled separately. if (!mapData.IsDeclareTarget[i]) { - auto mapOp = - mlir::dyn_cast_if_present(mapData.MapClause[i]); - mlir::omp::VariableCaptureKind captureKind = - mapOp.getMapCaptureType().value_or( - mlir::omp::VariableCaptureKind::ByRef); + auto mapOp = cast(mapData.MapClause[i]); + omp::VariableCaptureKind captureKind = + mapOp.getMapCaptureType().value_or(omp::VariableCaptureKind::ByRef); bool isPtrTy = checkIfPointerMap(mapOp); // Currently handles array sectioning lowerbound case, but more @@ -2824,7 +2870,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, // function mimics some of the logic from Clang that we require for // kernel argument passing from host -> device. switch (captureKind) { - case mlir::omp::VariableCaptureKind::ByRef: { + case omp::VariableCaptureKind::ByRef: { llvm::Value *newV = mapData.Pointers[i]; std::vector offsetIdx = calculateBoundsOffset( moduleTranslation, builder, mapData.BaseType[i]->isArrayTy(), @@ -2837,7 +2883,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, "array_offset"); mapData.Pointers[i] = newV; } break; - case mlir::omp::VariableCaptureKind::ByCopy: { + case omp::VariableCaptureKind::ByCopy: { llvm::Type *type = mapData.BaseType[i]; llvm::Value *newV; if (mapData.Pointers[i]->getType()->isPointerTy()) @@ -2859,8 +2905,8 @@ createAlteredByCaptureMap(MapInfoData &mapData, mapData.Pointers[i] = newV; mapData.BasePointers[i] = newV; } break; - case mlir::omp::VariableCaptureKind::This: - case mlir::omp::VariableCaptureKind::VLAType: + case omp::VariableCaptureKind::This: + case omp::VariableCaptureKind::VLAType: mapData.MapClause[i]->emitOpError("Unhandled capture kind"); break; } @@ -2873,10 +2919,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, - MapInfoData &mapData, - const SmallVector &useDevicePtrVars = {}, - const SmallVector &useDeviceAddrVars = {}, - bool isTargetParams = false) { + MapInfoData &mapData, bool isTargetParams = false) { // We wish to modify some of the methods in which arguments are // passed based on their capture type by the target region, this can // involve generating new loads and stores, which changes the @@ -2893,15 +2936,6 @@ static void genMapInfos(llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - auto fail = [&combinedInfo]() -> void { - combinedInfo.BasePointers.clear(); - combinedInfo.Pointers.clear(); - combinedInfo.DevicePointers.clear(); - combinedInfo.Sizes.clear(); - combinedInfo.Types.clear(); - combinedInfo.Names.clear(); - }; - // We operate under the assumption that all vectors that are // required in MapInfoData are of equal lengths (either filled with // default constructed data or appropiate information) so we can @@ -2913,7 +2947,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, if (mapData.IsAMember[i]) continue; - auto mapInfoOp = mlir::dyn_cast(mapData.MapClause[i]); + auto mapInfoOp = dyn_cast(mapData.MapClause[i]); 
if (!mapInfoOp.getMembers().empty()) { processMapWithMembersOf(moduleTranslation, builder, *ompBuilder, dl, combinedInfo, mapData, i, isTargetParams); @@ -2922,46 +2956,6 @@ static void genMapInfos(llvm::IRBuilderBase &builder, processIndividualMap(mapData, i, combinedInfo, isTargetParams); } - - auto findMapInfo = [&combinedInfo](llvm::Value *val, unsigned &index) { - index = 0; - for (llvm::Value *basePtr : combinedInfo.BasePointers) { - if (basePtr == val) - return true; - index++; - } - return false; - }; - - auto addDevInfos = [&, fail](auto useDeviceVars, auto devOpType) -> void { - for (const auto &useDeviceVar : useDeviceVars) { - // TODO: Only LLVMPointerTypes are handled. - if (!isa(useDeviceVar.getType())) - return fail(); - - llvm::Value *mapOpValue = moduleTranslation.lookupValue(useDeviceVar); - - // Check if map info is already present for this entry. - unsigned infoIndex; - if (findMapInfo(mapOpValue, infoIndex)) { - combinedInfo.Types[infoIndex] |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; - combinedInfo.DevicePointers[infoIndex] = devOpType; - } else { - combinedInfo.BasePointers.emplace_back(mapOpValue); - combinedInfo.Pointers.emplace_back(mapOpValue); - combinedInfo.DevicePointers.emplace_back(devOpType); - combinedInfo.Names.emplace_back( - LLVM::createMappingInformation(useDeviceVar.getLoc(), *ompBuilder)); - combinedInfo.Types.emplace_back( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM); - combinedInfo.Sizes.emplace_back(builder.getInt64(0)); - } - } - }; - - addDevInfos(useDevicePtrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer); - addDevInfos(useDeviceAddrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Address); } static LogicalResult @@ -3058,19 +3052,15 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; MapInfoData mapData; - collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, DL, builder); + collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, DL, + builder, useDevicePtrVars, useDeviceAddrVars); // Fill up the arrays with all the mapped variables. 
llvm::OpenMPIRBuilder::MapInfosTy combinedInfo; auto genMapInfoCB = [&](InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::MapInfosTy & { builder.restoreIP(codeGenIP); - if (auto dataOp = dyn_cast(op)) { - genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData, - useDevicePtrVars, useDeviceAddrVars); - } else { - genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); - } + genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); return combinedInfo; }; @@ -3089,21 +3079,21 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (!info.DevicePtrInfoMap.empty()) { builder.restoreIP(codeGenIP); unsigned argIndex = 0; - for (auto &devPtrOp : useDevicePtrVars) { - llvm::Value *mapOpValue = moduleTranslation.lookupValue(devPtrOp); - const auto &arg = region.front().getArgument(argIndex); - moduleTranslation.mapValue(arg, - info.DevicePtrInfoMap[mapOpValue].second); - argIndex++; - } - - for (auto &devAddrOp : useDeviceAddrVars) { - llvm::Value *mapOpValue = moduleTranslation.lookupValue(devAddrOp); - const auto &arg = region.front().getArgument(argIndex); - auto *LI = builder.CreateLoad( - builder.getPtrTy(), info.DevicePtrInfoMap[mapOpValue].second); - moduleTranslation.mapValue(arg, LI); - argIndex++; + for (auto [basePointer, devicePointer] : llvm::zip_equal( + combinedInfo.BasePointers, combinedInfo.DevicePointers)) { + if (devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer) { + const auto &arg = region.front().getArgument(argIndex); + moduleTranslation.mapValue( + arg, info.DevicePtrInfoMap[basePointer].second); + argIndex++; + } else if (devicePointer == + llvm::OpenMPIRBuilder::DeviceInfoTy::Address) { + const auto &arg = region.front().getArgument(argIndex); + auto *loadInst = builder.CreateLoad( + builder.getPtrTy(), info.DevicePtrInfoMap[basePointer].second); + moduleTranslation.mapValue(arg, loadInst); + argIndex++; + } } bodyGenStatus = inlineConvertOmpRegions(region, "omp.data.region", @@ -3116,6 +3106,20 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, // If device info is available then region has already been generated if (info.DevicePtrInfoMap.empty()) { builder.restoreIP(codeGenIP); + // For device pass, if use_device_ptr(addr) mappings were present, + // we need to link them here before codegen. 
+ if (ompBuilder->Config.IsTargetDevice.value_or(false)) { + unsigned argIndex = 0; + for (auto [basePointer, devicePointer] : + llvm::zip_equal(mapData.BasePointers, mapData.DevicePointers)) { + if (devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer || + devicePointer == llvm::OpenMPIRBuilder::DeviceInfoTy::Address) { + const auto &arg = region.front().getArgument(argIndex); + moduleTranslation.mapValue(arg, basePointer); + argIndex++; + } + } + } bodyGenStatus = inlineConvertOmpRegions(region, "omp.data.region", builder, moduleTranslation); } @@ -3335,17 +3339,14 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, llvm::IRBuilderBase::InsertPoint codeGenIP) { builder.restoreIP(allocaIP); - mlir::omp::VariableCaptureKind capture = - mlir::omp::VariableCaptureKind::ByRef; + omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef; // Find the associated MapInfoData entry for the current input for (size_t i = 0; i < mapData.MapClause.size(); ++i) if (mapData.OriginalValue[i] == input) { - if (auto mapOp = mlir::dyn_cast_if_present( - mapData.MapClause[i])) { - capture = mapOp.getMapCaptureType().value_or( - mlir::omp::VariableCaptureKind::ByRef); - } + auto mapOp = cast(mapData.MapClause[i]); + capture = + mapOp.getMapCaptureType().value_or(omp::VariableCaptureKind::ByRef); break; } @@ -3366,18 +3367,18 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, builder.restoreIP(codeGenIP); switch (capture) { - case mlir::omp::VariableCaptureKind::ByCopy: { + case omp::VariableCaptureKind::ByCopy: { retVal = v; break; } - case mlir::omp::VariableCaptureKind::ByRef: { + case omp::VariableCaptureKind::ByRef: { retVal = builder.CreateAlignedLoad( v->getType(), v, ompBuilder.M.getDataLayout().getPrefTypeAlign(v->getType())); break; } - case mlir::omp::VariableCaptureKind::This: - case mlir::omp::VariableCaptureKind::VLAType: + case omp::VariableCaptureKind::This: + case omp::VariableCaptureKind::VLAType: assert(false && "Currently unsupported capture kind"); break; } @@ -3429,8 +3430,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); unsigned argIndex = 0; for (auto &mapOp : mapVars) { - auto mapInfoOp = - mlir::dyn_cast(mapOp.getDefiningOp()); + auto mapInfoOp = cast(mapOp.getDefiningOp()); llvm::Value *mapOpValue = moduleTranslation.lookupValue(mapInfoOp.getVarPtr()); const auto &arg = targetRegion.front().getArgument(argIndex); @@ -3458,14 +3458,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, moduleTranslation); MapInfoData mapData; - collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, dl, builder); + collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, + builder); llvm::OpenMPIRBuilder::MapInfosTy combinedInfos; auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::MapInfosTy & { builder.restoreIP(codeGenIP); - genMapInfos(builder, moduleTranslation, dl, combinedInfos, mapData, {}, {}, - true); + genMapInfos(builder, moduleTranslation, dl, combinedInfos, mapData, true); return combinedInfos; }; diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index bf9fa183bfb80..458d2f28a78f8 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -209,7 +209,8 @@ llvm.func @_QPopenmp_target_use_dev_ptr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x 
!llvm.ptr : (i64) -> !llvm.ptr %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -253,7 +254,8 @@ llvm.func @_QPopenmp_target_use_dev_addr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -295,7 +297,8 @@ llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() { %0 = llvm.mlir.constant(1 : i64) : i64 %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(10 : i32) : i32 llvm.store %1, %arg0 : i32, !llvm.ptr @@ -337,7 +340,8 @@ llvm.func @_QPopenmp_target_use_dev_addr_nomap() { %1 = llvm.mlir.constant(1 : i64) : i64 %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%map2 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr): %2 = llvm.mlir.constant(10 : i32) : i32 %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr @@ -394,7 +398,9 @@ llvm.func @_QPopenmp_target_use_dev_both() { %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} %map1 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_ptr(%a : !llvm.ptr) use_device_addr(%b : !llvm.ptr) { + %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_ptr(%map2 : !llvm.ptr) use_device_addr(%map3 : !llvm.ptr) { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %2 = llvm.mlir.constant(10 : i32) : i32 %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr diff --git 
a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir new file mode 100644 index 0000000000000..f094a46581dee --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// This tests check that target code nested inside a target data region which +// has only use_device_ptr mapping corectly generates code on the device pass. + +// CHECK: define weak_odr protected void @__omp_offloading{{.*}}main_ +// CHECK-NEXT: entry: +// CHECK-NEXT: %[[VAL_3:.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr %[[VAL_4:.*]], ptr %[[VAL_3]], align 8 +// CHECK-NEXT: %[[VAL_5:.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_{{.*}}_kernel_environment, ptr %[[VAL_6:.*]]) +// CHECK-NEXT: %[[VAL_7:.*]] = icmp eq i32 %[[VAL_5]], -1 +// CHECK-NEXT: br i1 %[[VAL_7]], label %[[VAL_8:.*]], label %[[VAL_9:.*]] +// CHECK: user_code.entry: ; preds = %[[VAL_10:.*]] +// CHECK-NEXT: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_3]], align 8 +// CHECK-NEXT: br label %[[VAL_12:.*]] +// CHECK: omp.target: ; preds = %[[VAL_8]] +// CHECK-NEXT: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8 +// CHECK-NEXT: store i32 999, ptr %[[VAL_13]], align 4 +// CHECK-NEXT: br label %[[VAL_14:.*]] +module attributes {omp.is_target_device = true } { + llvm.func @_QQmain() attributes {fir.bindc_name = "main"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr + %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data use_device_ptr(%map : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %map1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%map1 : !llvm.ptr){ + ^bb0(%arg1: !llvm.ptr): + %1 = llvm.mlir.constant(999 : i32) : i32 + %2 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr + llvm.store %1, %2 : i32, !llvm.ptr + omp.terminator + } + omp.terminator + } + llvm.return + } +} From 7330c9b033f29bd92b99db6282c0f71de64122ab Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 4 Sep 2024 19:46:27 +0800 Subject: [PATCH 082/425] [flang] Fix -Wunneeded-internal-declaration in DirectivesCommon.h (NFC) /llvm-project/flang/lib/Lower/DirectivesCommon.h:910:1: error: 'static' function 'peelOuterConvert' declared in header file should be declared 'static inline' [-Werror,-Wunneeded-internal-declaration] peelOuterConvert(Fortran::semantics::SomeExpr &expr) { ^ 1 error generated. --- flang/lib/Lower/DirectivesCommon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index d8b1f1f3e4362..d2060e77ce530 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -906,7 +906,7 @@ struct PeelConvert { } }; -static Fortran::semantics::SomeExpr +static inline Fortran::semantics::SomeExpr peelOuterConvert(Fortran::semantics::SomeExpr &expr) { if (auto peeled = PeelConvert::visit(expr)) return *peeled; From d94199c8ffba3f3895da7627d3dbbca62937310c Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 4 Sep 2024 13:49:48 +0200 Subject: [PATCH 083/425] [clang] Make lifetimebound and GSL analysis more coherent (#105884) This allows clang to detect more use-after-free bugs (shown in the #100549). 
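As a rough illustration (a reduced sketch of the warn-lifetime-analysis-nocfg.cpp
tests added by this patch; StrCat and ReturnStringView are stand-in declarations,
not real APIs), a GSL pointer obtained through a [[clang::lifetimebound]]
parameter from a temporary is now diagnosed:

  #include <string>
  #include <string_view>

  std::string StrCat(std::string_view, std::string_view);
  std::string_view ReturnStringView(std::string_view s [[clang::lifetimebound]]);

  void use() {
    std::string_view sv = ReturnStringView(StrCat("bar", "x"));
    // warning: object backing the pointer will be destroyed at the end of
    // the full-expression
  }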
This relands the remaining change (removing the EnableLifetimeWarnings flag) in https://github.com/llvm/llvm-project/pull/104906, with a proper fix for the regression. Fixes #100549 --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/CheckExprLifetime.cpp | 154 +++++++----------- .../Sema/warn-lifetime-analysis-nocfg.cpp | 26 +++ 3 files changed, 87 insertions(+), 96 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 251eb4c1c4559..bcdbc5b702765 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -270,6 +270,9 @@ Improvements to Clang's diagnostics - Clang now respects lifetimebound attribute for the assignment operator parameter. (#GH106372). +- The lifetimebound and GSL analysis in clang are coherent, allowing clang to + detect more use-after-free bugs. (#GH100549). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f7540a6e3a897..1482711cc2839 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -237,13 +237,11 @@ static bool pathContainsInit(IndirectLocalPath &Path) { static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings); + bool RevisitSubinits); static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings); + LocalVisitor Visit); template static bool isRecordWithAttr(QualType Type) { if (auto *RD = Type->getAsCXXRecordDecl()) @@ -369,8 +367,7 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { // Visit lifetimebound or gsl-pointer arguments. static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { + LocalVisitor Visit) { const FunctionDecl *Callee; ArrayRef Args; @@ -385,6 +382,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, if (!Callee) return; + bool EnableGSLAnalysis = !Callee->getASTContext().getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); Expr *ObjectArg = nullptr; if (isa(Call) && Callee->isCXXInstanceMember()) { ObjectArg = Args[0]; @@ -397,11 +396,9 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/false); + Visit); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); }; auto VisitGSLPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { @@ -412,7 +409,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // Once we initialized a value with a reference, it can no longer dangle. 
if (!Value) { for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit || + PE.Kind == IndirectLocalPathEntry::LifetimeBoundCall) continue; if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) @@ -425,11 +423,9 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/true); + Visit); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/true); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); }; @@ -452,7 +448,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, CheckCoroObjArg = false; if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); - else if (EnableLifetimeWarnings) { + else if (EnableGSLAnalysis) { if (auto *CME = dyn_cast(Callee); CME && shouldTrackImplicitObjectArg(CME)) VisitGSLPointerArg(Callee, ObjectArg, @@ -465,15 +461,15 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, I != N; ++I) { if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); - else if (EnableLifetimeWarnings && I == 0) { + else if (EnableGSLAnalysis && I == 0) { if (shouldTrackFirstArgument(Callee)) { VisitGSLPointerArg(Callee, Args[0], !Callee->getReturnType()->isReferenceType()); - } else { - if (auto *CCE = dyn_cast(Call); - CCE && CCE->getConstructor()->getParent()->hasAttr()) - VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], - true); + } else if (auto *CCE = dyn_cast(Call); + CCE && + CCE->getConstructor()->getParent()->hasAttr()) { + VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], + true); } } } @@ -483,8 +479,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, /// glvalue expression \c Init. static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { + LocalVisitor Visit) { RevertToOldSizeRAII RAII(Path); // Walk past any constructs which we can lifetime-extend across. @@ -521,8 +516,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, else // We can't lifetime extend through this but we might still find some // retained temporaries. - return visitLocalsRetainedByInitializer(Path, Init, Visit, true, - EnableLifetimeWarnings); + return visitLocalsRetainedByInitializer(Path, Init, Visit, true); } // Step into CXXDefaultInitExprs so we can diagnose cases where a @@ -536,21 +530,18 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, if (auto *MTE = dyn_cast(Init)) { if (Visit(Path, Local(MTE), RK)) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true); } if (auto *M = dyn_cast(Init)) { // Lifetime of a non-reference type field is same as base object. 
if (auto *F = dyn_cast(M->getMemberDecl()); F && !F->getType()->isReferenceType()) - visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true); } if (isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit, - EnableLifetimeWarnings); + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::DeclRefExprClass: { @@ -569,8 +560,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, } else if (VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } } break; @@ -582,15 +572,13 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, // handling all sorts of rvalues passed to a unary operator. const UnaryOperator *U = cast(Init); if (U->getOpcode() == UO_Deref) - visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true); break; } case Stmt::ArraySectionExprClass: { - visitLocalsRetainedByInitializer(Path, - cast(Init)->getBase(), - Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer( + Path, cast(Init)->getBase(), Visit, true); break; } @@ -598,11 +586,9 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, case Stmt::BinaryConditionalOperatorClass: { auto *C = cast(Init); if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit); break; } @@ -625,8 +611,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, /// the prvalue expression \c Init. static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings) { + bool RevisitSubinits) { RevertToOldSizeRAII RAII(Path); Expr *Old; @@ -667,18 +652,16 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (VD && VD->getType().isConstQualified() && VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByInitializer( - Path, VD->getInit(), Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, VD->getInit(), Visit, + true); } } else if (auto *MTE = dyn_cast(L)) { if (MTE->getType().isConstQualified()) visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), - Visit, true, - EnableLifetimeWarnings); + Visit, true); } return false; - }, - EnableLifetimeWarnings); + }); // We assume that objects can be retained by pointers cast to integers, // but not if the integer is cast to floating-point type or to _Complex. @@ -707,9 +690,8 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // Model array-to-pointer decay as taking the address of the array // lvalue. 
Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); - return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + return visitLocalsRetainedByReferenceBinding( + Path, CE->getSubExpr(), RK_ReferenceBinding, Visit); default: return; @@ -724,8 +706,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // lifetime of the array exactly like binding a reference to a temporary. if (auto *ILE = dyn_cast(Init)) return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), - RK_StdInitializerList, Visit, - EnableLifetimeWarnings); + RK_StdInitializerList, Visit); if (InitListExpr *ILE = dyn_cast(Init)) { // We already visited the elements of this initializer list while @@ -736,14 +717,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (ILE->isTransparent()) return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); if (ILE->getType()->isArrayType()) { for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); return; } @@ -756,14 +735,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (RD->isUnion() && ILE->getInitializedFieldInUnion() && ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else { unsigned Index = 0; for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); for (const auto *I : RD->fields()) { if (Index >= ILE->getNumInits()) break; @@ -772,14 +749,13 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *SubInit = ILE->getInit(Index); if (I->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, SubInit, - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else // This might be either aggregate-initialization of a member or // initialization of a std::initializer_list object. Regardless, // we should recursively lifetime-extend that initializer. 
- visitLocalsRetainedByInitializer( - Path, SubInit, Visit, RevisitSubinits, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, SubInit, Visit, + RevisitSubinits); ++Index; } } @@ -800,10 +776,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); if (E->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, - Visit, EnableLifetimeWarnings); + Visit); else - visitLocalsRetainedByInitializer(Path, E, Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, E, Visit, true); if (Cap.capturesVariable()) Path.pop_back(); } @@ -817,16 +792,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Arg = MTE->getSubExpr(); Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, CCE->getConstructor()}); - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings*/ false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); } } } if (isa(Init) || isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit, - EnableLifetimeWarnings); + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::UnaryOperatorClass: { @@ -842,8 +815,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } break; } @@ -856,11 +828,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, break; if (BO->getLHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true); else if (BO->getRHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true); break; } @@ -870,11 +840,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // In C++, we can have a throw-expression operand, which has 'void' type // and isn't interesting from a lifetime perspective. 
if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true); break; } @@ -992,8 +960,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, LifetimeKind LK, - const AssignedEntity *AEntity, Expr *Init, - bool EnableLifetimeWarnings) { + const AssignedEntity *AEntity, Expr *Init) { assert((AEntity && LK == LK_Assignment) || (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking @@ -1292,13 +1259,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef, if (Init->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, - TemporaryVisitor, - EnableLifetimeWarnings); + TemporaryVisitor); else visitLocalsRetainedByInitializer( Path, Init, TemporaryVisitor, // Don't revisit the sub inits for the intialization case. - /*RevisitSubinits=*/!InitEntity, EnableLifetimeWarnings); + /*RevisitSubinits=*/!InitEntity); } void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, @@ -1306,16 +1272,12 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, auto LTResult = getEntityLifetime(&Entity); LifetimeKind LK = LTResult.getInt(); const InitializedEntity *ExtendingEntity = LTResult.getPointer(); - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, - /*AEntity*/ nullptr, Init, EnableLifetimeWarnings); + /*AEntity*/ nullptr, Init); } void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, Expr *Init) { - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); bool EnableDanglingPointerAssignment = !SemaRef.getDiagnostics().isIgnored( diag::warn_dangling_pointer_assignment, SourceLocation()); bool RunAnalysis = (EnableDanglingPointerAssignment && @@ -1327,7 +1289,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, - Init, EnableLifetimeWarnings); + Init); } } // namespace clang::sema diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index cd1904db32710..67d1ceaa02d03 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -498,4 +498,30 @@ std::string_view test2(int i, std::optional a) { return std::move(*a); return std::move(a.value()); } + +struct Foo; +struct FooView { + FooView(const Foo& foo [[clang::lifetimebound]]); +}; +FooView test3(int i, std::optional a) { + if (i) + return *a; // expected-warning {{address of stack memory}} + return a.value(); // expected-warning {{address of stack memory}} +} +} // namespace GH93386 + +namespace GH100549 { +struct UrlAnalyzed { + UrlAnalyzed(std::string_view url [[clang::lifetimebound]]); +}; +std::string StrCat(std::string_view, std::string_view); +void test1() { + UrlAnalyzed url(StrCat("abc", "bcd")); // expected-warning 
{{object backing the pointer will be destroyed}}
+}
+
+std::string_view ReturnStringView(std::string_view abc [[clang::lifetimebound]]);
+
+void test() {
+ std::string_view svjkk1 = ReturnStringView(StrCat("bar", "x")); // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}}
}
+} // namespace GH100549

From af1e59aea2ea7d07ece1f34621dda38c995826a3 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 3 Sep 2024 15:35:40 -0700
Subject: [PATCH 084/425] [SLP]Fix PR107037: correctly track original/modified-after-vectorization reduced values

Need to correctly track reduced values with multiple uses in the same reduction
emission attempt. Otherwise, the number of reuses might be calculated
incorrectly, which may cause a compiler crash.

Fixes https://github.com/llvm/llvm-project/issues/107037
---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 69 ++++++++++---------
 .../X86/multi-tracked-reduced-value.ll | 47 +++++++++++++
 2 files changed, 85 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7272deb3c34f..19b95cf473e91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17721,6 +17721,12 @@ class HorizontalReduction {
 for (Value *V : Candidates)
 TrackedVals.try_emplace(V, V);

+ auto At = [](MapVector &MV, Value *V) -> unsigned & {
+ auto *It = MV.find(V);
+ assert(It != MV.end() && "Unable to find given key.");
+ return It->second;
+ };
+
 DenseMap VectorizedVals(ReducedVals.size());
 // List of the values that were reduced in other trees as part of gather
 // nodes and thus requiring extract if fully vectorized in other trees.
@@ -17738,7 +17744,7 @@ class HorizontalReduction {
 Candidates.reserve(2 * OrigReducedVals.size());
 DenseMap TrackedToOrig(2 * OrigReducedVals.size());
 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
- Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
+ Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
 // Check if the reduction value was not overriden by the extractelement
 // instruction because of the vectorization and exclude it, if it is not
 // compatible with other values.
@@ -17757,7 +17763,7 @@ class HorizontalReduction {
 I + 1 < E) {
 SmallVector CommonCandidates(Candidates);
 for (Value *RV : ReducedVals[I + 1]) {
- Value *RdxVal = TrackedVals.find(RV)->second;
+ Value *RdxVal = TrackedVals.at(RV);
 // Check if the reduction value was not overriden by the
 // extractelement instruction because of the vectorization and
 // exclude it, if it is not compatible with other values.
@@ -17778,10 +17784,12 @@ class HorizontalReduction {
 // Emit code for constant values.
if (Candidates.size() > 1 && allConstant(Candidates)) { Value *Res = Candidates.front(); - ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond(); + Value *OrigV = TrackedToOrig.at(Candidates.front()); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); for (Value *VC : ArrayRef(Candidates).drop_front()) { Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps); - ++VectorizedVals.try_emplace(VC, 0).first->getSecond(); + Value *OrigV = TrackedToOrig.at(VC); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); if (auto *ResI = dyn_cast(Res)) V.analyzedReductionRoot(ResI); } @@ -17802,8 +17810,10 @@ class HorizontalReduction { // Gather same values. MapVector SameValuesCounter; if (IsSupportedHorRdxIdentityOp) - for (Value *V : Candidates) - ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second; + for (Value *V : Candidates) { + Value *OrigV = TrackedToOrig.at(V); + ++SameValuesCounter.try_emplace(OrigV).first->second; + } // Used to check if the reduced values used same number of times. In this // case the compiler may produce better code. E.g. if reduced values are // aabbccdd (8 x values), then the first node of the tree will have a node @@ -17827,12 +17837,12 @@ class HorizontalReduction { }); Candidates.resize(SameValuesCounter.size()); transform(SameValuesCounter, Candidates.begin(), - [](const auto &P) { return P.first; }); + [&](const auto &P) { return TrackedVals.at(P.first); }); NumReducedVals = Candidates.size(); // Have a reduction of the same element. if (NumReducedVals == 1) { - Value *OrigV = TrackedToOrig.find(Candidates.front())->second; - unsigned Cnt = SameValuesCounter.lookup(OrigV); + Value *OrigV = TrackedToOrig.at(Candidates.front()); + unsigned Cnt = At(SameValuesCounter, OrigV); Value *RedVal = emitScaleForReusedOps(Candidates.front(), Builder, Cnt); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); @@ -17936,8 +17946,8 @@ class HorizontalReduction { if (Cnt >= Pos && Cnt < Pos + ReduxWidth) continue; Value *V = Candidates[Cnt]; - Value *OrigV = TrackedToOrig.find(V)->second; - ++SameValuesCounter[OrigV]; + Value *OrigV = TrackedToOrig.at(V); + ++SameValuesCounter.try_emplace(OrigV).first->second; } } SmallPtrSet VLScalars(VL.begin(), VL.end()); @@ -17955,10 +17965,10 @@ class HorizontalReduction { LocalExternallyUsedValues[RdxVal]; continue; } - Value *OrigV = TrackedToOrig.find(RdxVal)->second; + Value *OrigV = TrackedToOrig.at(RdxVal); unsigned NumOps = - VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV]; - if (NumOps != ReducedValsToOps.find(OrigV)->second.size()) + VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV); + if (NumOps != ReducedValsToOps.at(OrigV).size()) LocalExternallyUsedValues[RdxVal]; } // Do not need the list of reused scalars in regular mode anymore. @@ -17983,9 +17993,8 @@ class HorizontalReduction { break; if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "HorSLPNotBeneficial", - ReducedValsToOps.find(VL[0])->second.front()) + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.at(VL[0]).front()) << "Vectorizing horizontal reduction is possible " << "but not beneficial with cost " << ore::NV("Cost", Cost) << " and threshold " @@ -17999,9 +18008,8 @@ class HorizontalReduction { LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemark( - SV_NAME, "VectorizedHorizontalReduction", - ReducedValsToOps.find(VL[0])->second.front()) + return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.at(VL[0]).front()) << "Vectorized horizontal reduction with cost " << ore::NV("Cost", Cost) << " and with tree size " << ore::NV("TreeSize", V.getTreeSize()); @@ -18083,12 +18091,12 @@ class HorizontalReduction { VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); // Count vectorized reduced values to exclude them from final reduction. for (Value *RdxVal : VL) { - Value *OrigV = TrackedToOrig.find(RdxVal)->second; + Value *OrigV = TrackedToOrig.at(RdxVal); if (IsSupportedHorRdxIdentityOp) { - VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]); + VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV)); continue; } - ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond(); + ++VectorizedVals.try_emplace(OrigV).first->getSecond(); if (!V.isVectorized(RdxVal)) RequiredExtract.insert(RdxVal); } @@ -18099,10 +18107,10 @@ class HorizontalReduction { } if (OptReusedScalars && !AnyVectorized) { for (const std::pair &P : SameValuesCounter) { - Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second); + Value *RdxVal = TrackedVals.at(P.first); + Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); - Value *OrigV = TrackedToOrig.find(P.first)->second; - VectorizedVals.try_emplace(OrigV, P.second); + VectorizedVals.try_emplace(P.first, P.second); } continue; } @@ -18190,8 +18198,7 @@ class HorizontalReduction { continue; unsigned NumOps = VectorizedVals.lookup(RdxVal); for (Instruction *RedOp : - ArrayRef(ReducedValsToOps.find(RdxVal)->second) - .drop_back(NumOps)) + ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps)) ExtraReductions.emplace_back(RedOp, RdxVal); } } @@ -18430,7 +18437,7 @@ class HorizontalReduction { // root = mul prev_root, <1, 1, n, 1> SmallVector Vals; for (Value *V : VL) { - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); } auto *Scale = ConstantVector::get(Vals); @@ -18468,7 +18475,7 @@ class HorizontalReduction { bool NeedShuffle = false; for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { Value *V = VL[I]; - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); if (Cnt % 2 == 0) { Mask[I] = VF; NeedShuffle = true; @@ -18488,7 +18495,7 @@ class HorizontalReduction { // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> SmallVector Vals; for (Value *V : VL) { - unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V)); Vals.push_back(ConstantFP::get(V->getType(), Cnt)); } auto *Scale = ConstantVector::get(Vals); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll new file mode 100644 index 0000000000000..e012cc60960b3 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | 
FileCheck %s + +define i8 @test() { +; CHECK-LABEL: define i8 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i8 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[OP_RDX]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX3]], [[TMP3]] +; CHECK-NEXT: ret i8 [[OP_RDX4]] +; +entry: + %0 = trunc i32 0 to i8 + %1 = add i8 %0, 0 + %2 = add i8 %0, 0 + %3 = add i8 %0, 0 + %4 = add i8 %0, 0 + %5 = trunc i32 0 to i8 + %6 = or i8 %5, %0 + %7 = or i8 %6, %2 + %8 = or i8 %7, %3 + %9 = or i8 %8, %0 + %10 = or i8 %9, %4 + %conv4 = or i8 %10, %1 + %11 = trunc i32 0 to i8 + %12 = add i8 %11, 0 + %conv7 = or i8 %conv4, %12 + %13 = add i8 %11, 0 + %14 = add i8 %11, 0 + %15 = add i8 %11, 0 + %16 = trunc i32 0 to i8 + %17 = or i8 %13, %16 + %18 = or i8 %17, %14 + %19 = or i8 %18, %11 + %20 = or i8 %19, %15 + %conv5 = or i8 %20, %conv7 + %21 = trunc i32 0 to i8 + %conv6 = or i8 %21, %conv5 + ret i8 %conv6 +} From bb1b368e0ad3da98b4c51018bdbcd6a83d8e646d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 4 Sep 2024 13:07:57 +0100 Subject: [PATCH 085/425] [AArch64] Implement intrinsics for SVE FAMIN/FAMAX (#99042) This patch implements the following intrinsics: * Floating-point absolute maximum (predicated) svfloat16_t svamax[_f16]_m(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_f16]_x(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_f16]_z(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svamax[_n_f16]_m(svbool_t, svfloat16_t, float16_t); svfloat16_t svamax[_n_f16]_x(svbool_t, svfloat16_t, float16_t); svfloat16_t svamax[_n_f16]_z(svbool_t, svfloat16_t, float16_t); * Floating-point absolute minimum (predicated) svfloat16_t svmin[_f16]_m(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_f16]_x(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_f16]_z(svbool_t, svfloat16_t, svfloat16_t); svfloat16_t svmin[_n_f16]_m(svbool_t, svfloat16_t, float16_t); svfloat16_t svmin[_n_f16]_x(svbool_t, svfloat16_t, float16_t); svfloat16_t svmin[_n_f16]_z(svbool_t, svfloat16_t, float16_t); All the intrinsics have also variants for `f32` and `f64`, and have the `__arm_streaming` attribute. (cf. 
https://github.com/ARM-software/acle/pull/324) --- clang/include/clang/Basic/arm_sve.td | 7 +- .../acle_sve2_faminmax.c | 775 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 7 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +- .../AArch64/sve2-intrinsics-faminmax.ll | 115 +++ 5 files changed, 908 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 81527d8b98760..a0f12e1bbd2d4 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2418,4 +2418,9 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} \ No newline at end of file +} + +let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { + defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; + defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; +} diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c new file mode 100644 index 0000000000000..3cf7d99d606f3 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_faminmax.c @@ -0,0 +1,775 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s + +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CPP + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include "arm_sme.h" +#else +#include "arm_sve.h" +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define 
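For reference, a sketch of how the new predicated absolute-minimum/maximum intrinsics are meant to be called from user code (illustrative only, not part of the patch: it assumes a compiler carrying this change with the sve2 and faminmax features enabled, and the function and variable names are invented):

#include <arm_sve.h>

svfloat32_t reduce_magnitude(svbool_t pg, svfloat32_t a, svfloat32_t b, float32_t limit) {
  svfloat32_t m = svamax_f32_m(pg, a, b);        // merging form: inactive lanes keep a
  svfloat32_t x = svamax_f32_x(pg, a, b);        // "don't care" form for inactive lanes
  svfloat32_t z = svamin_n_f32_z(pg, m, limit);  // zeroing form, scalar second operand
  return svamin_f32_m(pg, x, z);
}

The _m/_x/_z suffixes follow the usual ACLE predication conventions, which is also what the CHECK lines below verify: a select feeding the intrinsic for the _z forms, and the .u intrinsic for the _x forms.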
SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local @test_famin_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_mu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_f16_m(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_xu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_f16_x(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f16_zu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famin_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_mu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_n_f16_m(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_xu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famin_n_f16_x(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f16_zu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// 
CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famin_n_f16_z(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_mu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_f32_m(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_xu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_f32_x(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f32_zu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famin_f32_z(svbool_t pg, 
svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_mu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_n_f32_m(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_xu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famin_n_f32_x(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// 
CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f32_zu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famin_n_f32_z(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_mu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_f64_m(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_xu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_f64_x(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famin_f64_zu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famin_f64_z(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_mu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_n_f64_m(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_xu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famin.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famin_n_f64_x(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famin_n_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = 
insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famin_n_f64_zu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famin.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famin_n_f64_z(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamin, _n_f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_mu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_f16_m(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_xu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_f16_x(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f16_zu10__SVBool_tu13__SVFloat16_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famax_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_mu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_n_f16_m(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_xu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv8f16( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat16_t test_famax_n_f16_x(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _x)(pg, a, 
b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f16_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f16_zu10__SVBool_tu13__SVFloat16_tDh( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], half noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv8f16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat16_t test_famax_n_f16_z(svbool_t pg, svfloat16_t a, float16_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f16, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_mu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_f32_m(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_xu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_f32_x(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local 
@test_famax_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f32_zu10__SVBool_tu13__SVFloat32_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famax_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_mu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_n_f32_m(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_xu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector 
[[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv4f32( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat32_t test_famax_n_f32_x(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f32_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f32_zu10__SVBool_tu13__SVFloat32_tf( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], float noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv4f32( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat32_t test_famax_n_f32_z(svbool_t pg, svfloat32_t a, float32_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f32, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_mu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_f64_m(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_xu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_f64_x(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z16test_famax_f64_zu10__SVBool_tu13__SVFloat64_tS0_( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[B]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famax_f64_z(svbool_t pg, svfloat64_t a, svfloat64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _f64, _z)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_m( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_mu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_n_f64_m(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _m)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_x( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_xu10__SVBool_tu13__SVFloat64_td( +// 
CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.famax.u.nxv2f64( [[TMP0]], [[A]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP1]] +// +svfloat64_t test_famax_n_f64_x(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _x)(pg, a, b); +} + +// CHECK-LABEL: define dso_local @test_famax_n_f64_z( +// CHECK-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CHECK-CPP-LABEL: define dso_local @_Z18test_famax_n_f64_zu10__SVBool_tu13__SVFloat64_td( +// CHECK-CPP-SAME: [[PG:%.*]], [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) +// CHECK-CPP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[B]], i64 0 +// CHECK-CPP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-CPP-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[A]], zeroinitializer +// CHECK-CPP-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.famax.nxv2f64( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-CPP-NEXT: ret [[TMP2]] +// +svfloat64_t test_famax_n_f64_z(svbool_t pg, svfloat64_t a, float64_t b) STREAMING { + return SVE_ACLE_FUNC(svamax, _n_f64, _z)(pg, a, b); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6727ee69d7b3e..9bce850750f79 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3785,3 +3785,10 @@ def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic; def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic; +// SVE2/SME2 - Floating point absolute maximum and minimum + +def int_aarch64_sve_famax : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_famax_u : AdvSIMD_Pred2VectorArg_Intrinsic; + +def int_aarch64_sve_famin : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index af8ddb49b0ac6..4922fb280333b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4183,9 +4183,11 @@ defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single<0b11, "fcvtnt", ZZ_s_mul_r>; } // End HasSVE2orSME2, HasFP8 let Predicates = [HasSVE2orSME2, HasFAMINMAX] in { -// FP8 Arithmetic - Predicated Group -defm FAMIN_ZPmZ : 
sve_fp_2op_p_zds<0b1111, "famin", "", null_frag, DestructiveOther>; -defm FAMAX_ZPmZ : sve_fp_2op_p_zds<0b1110, "famax", "", null_frag, DestructiveOther>; +defm FAMIN_ZPmZ : sve_fp_2op_p_zds<0b1111, "famin", "FAMIN_ZPZZ", int_aarch64_sve_famin, DestructiveBinaryComm>; +defm FAMAX_ZPmZ : sve_fp_2op_p_zds<0b1110, "famax", "FAMAX_ZPZZ", int_aarch64_sve_famax, DestructiveBinaryComm>; + +defm FAMAX_ZPZZ : sve_fp_bin_pred_hfd; +defm FAMIN_ZPZZ : sve_fp_bin_pred_hfd; } // End HasSVE2orSME2, HasFAMINMAX let Predicates = [HasSSVE_FP8FMA] in { diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll new file mode 100644 index 0000000000000..7d16f8383d968 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-faminmax.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @famin_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv8f16( %pg, %a, %b) + ret %r +} + +define @famin_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv4f32( %pg, %a, %b) + ret %r +} + +define @famin_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.nxv2f64( %pg, %a, %b) + ret %r +} + +define @famin_u_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv8f16( %pg, %b, %a) + ret %r +} + +define @famin_u_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv4f32( %pg, %b, %a) + ret %r +} + +define @famin_u_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famin_u_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famin.u.nxv2f64( %pg, %b, %a) + ret %r +} + +define @famax_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv8f16( %pg, %a, %b) + ret %r +} + +define @famax_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv4f32( %pg, %a, %b) + ret %r +} + +define @famax_f64( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.nxv2f64( %pg, %a, %b) + ret %r +} + +define @famax_u_f16( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_u_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv8f16( %pg, %b, %a) + ret %r +} + +define @famax_u_f32( %pg, %a, %b) #0 { +; CHECK-LABEL: famax_u_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv4f32( %pg, %b, %a) + ret %r +} + +define @famax_u_f64( %pg, %a, %b) #0 { +; 
CHECK-LABEL: famax_u_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: famax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %r = call @llvm.aarch64.sve.famax.u.nxv2f64( %pg, %b, %a) + ret %r +} + +attributes #0 = { nounwind "target-features" = "+faminmax" } From c2b92a4250b3f514685676ba8985ea73450f14d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Chmiel?= Date: Wed, 4 Sep 2024 14:10:51 +0200 Subject: [PATCH 086/425] [SROA] Use SetVector for PromotableAllocas (#105809) Use SetVector to make operations more efficient if there is a very large number of allocas. --- llvm/lib/Transforms/Scalar/SROA.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 2310cb3a7decb..d0186da1bc5e2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -198,7 +198,9 @@ class SROA { SmallSetVector PostPromotionWorklist; /// A collection of alloca instructions we can directly promote. - std::vector PromotableAllocas; + SetVector, + SmallPtrSet, 16> + PromotableAllocas; /// A worklist of PHIs to speculate prior to promoting allocas. /// @@ -4799,9 +4801,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Finally, don't try to promote any allocas that new require re-splitting. // They have already been added to the worklist above. - llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) { - return ResplitPromotableAllocas.count(AI); - }); + PromotableAllocas.set_subtract(ResplitPromotableAllocas); return true; } @@ -4963,7 +4963,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } if (PHIUsers.empty() && SelectUsers.empty()) { // Promote the alloca. - PromotableAllocas.push_back(NewAI); + PromotableAllocas.insert(NewAI); } else { // If we have either PHIs or Selects to speculate, add them to those // worklists and re-queue the new alloca so that we promote in on the @@ -5598,7 +5598,7 @@ bool SROA::promoteAllocas(Function &F) { LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n"); } else { LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC); + PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC); } PromotableAllocas.clear(); @@ -5615,7 +5615,7 @@ std::pair SROA::runSROA(Function &F) { if (AllocaInst *AI = dyn_cast(I)) { if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() && isAllocaPromotable(AI)) - PromotableAllocas.push_back(AI); + PromotableAllocas.insert(AI); else Worklist.insert(AI); } @@ -5639,10 +5639,9 @@ std::pair SROA::runSROA(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
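The container change can be pictured with a small standalone sketch (illustrative only, not part of the patch; the names are invented): a SetVector keeps the deterministic insertion order that PromoteMemToReg is handed, while also providing duplicate-free insertion and the set-style bulk removal used by the set_subtract calls in this hunk.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"

static void sketch(llvm::SetVector<int *> &Promotable, int *NewCandidate,
                   const llvm::SmallPtrSet<int *, 16> &Deleted) {
  Promotable.insert(NewCandidate);   // no-op if the candidate is already tracked
  Promotable.set_subtract(Deleted);  // drop every deleted candidate in one call
  llvm::ArrayRef<int *> InOrder = Promotable.getArrayRef(); // contiguous view, stable order
  (void)InOrder;
}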
if (!DeletedAllocas.empty()) { - auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; - Worklist.remove_if(IsInSet); - PostPromotionWorklist.remove_if(IsInSet); - llvm::erase_if(PromotableAllocas, IsInSet); + Worklist.set_subtract(DeletedAllocas); + PostPromotionWorklist.set_subtract(DeletedAllocas); + PromotableAllocas.set_subtract(DeletedAllocas); DeletedAllocas.clear(); } } From d65ff3e9364536f9e0bd5f1c1bace626c256a2ad Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 05:12:17 -0700 Subject: [PATCH 087/425] [SLP]Fix PR107198: add a check for empty complex type Need to check if the complex type is empty before trying to dig in, trying to find vectorizable type Fixes https://github.com/llvm/llvm-project/issues/107198 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ llvm/test/Transforms/SLPVectorizer/empty-struct.ll | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/empty-struct.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 19b95cf473e91..a2af7f4e1b01c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7780,6 +7780,8 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { Type *EltTy = T; while (isa(EltTy)) { + if (EltTy->isEmptyTy()) + return 0; if (auto *ST = dyn_cast(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) diff --git a/llvm/test/Transforms/SLPVectorizer/empty-struct.ll b/llvm/test/Transforms/SLPVectorizer/empty-struct.ll new file mode 100644 index 0000000000000..422d6b2eb3276 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/empty-struct.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define { { {}, {}, {}, {}, {}, {}, {} } } @test({ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } %0) { +; CHECK-LABEL: define { { {}, {}, {}, {}, {}, {}, {} } } @test( +; CHECK-SAME: { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } [[TMP0]], 18 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { { {}, {}, {}, {}, {}, {}, {} } } undef, { {}, {}, {}, {}, {}, {}, {} } [[TMP2]], 0 +; CHECK-NEXT: ret { { {}, {}, {}, {}, {}, {}, {} } } [[TMP3]] +; + %2 = extractvalue { {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, { {}, {}, {}, {}, {}, {}, {} }, { {} } } %0, 18 + %3 = insertvalue { { {}, {}, {}, {}, {}, {}, {} } } undef, { {}, {}, {}, {}, {}, {}, {} } %2, 0 + ret { { {}, {}, {}, {}, {}, {}, {} } } %3 +} From 26645ae2eea00456d98b497f348426c375409ce4 Mon Sep 17 00:00:00 2001 From: Menooker Date: Wed, 4 Sep 2024 20:36:19 +0800 Subject: [PATCH 088/425] [mlir][memref] Fix hoist-static-allocs option of buffer-results-to-out-params when function parameters are returned (#102093) buffer-results-to-out-params pass will have a nullptr-referencing error when hoist-static-allocs option is on, when the return value of a function is a parameter of the function. This PR fixes this issue. 
--- .../Transforms/BufferResultsToOutParams.cpp | 3 ++- .../buffer-results-to-out-params-elim.mlir | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp index b19636adaa69e..b7755b2be8483 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp @@ -120,7 +120,8 @@ static LogicalResult updateReturnOps(func::FuncOp func, } OpBuilder builder(op); for (auto [orig, arg] : llvm::zip(copyIntoOutParams, appendedEntryArgs)) { - if (hoistStaticAllocs && isa(orig.getDefiningOp()) && + if (hoistStaticAllocs && + isa_and_nonnull(orig.getDefiningOp()) && mlir::cast(orig.getType()).hasStaticShape()) { orig.replaceAllUsesWith(arg); orig.getDefiningOp()->erase(); diff --git a/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir index f77dbfaa6cb11..2783836a09e16 100644 --- a/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir +++ b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir @@ -34,4 +34,18 @@ func.func @basic_dynamic(%d: index) -> (memref) { %b = memref.alloc(%d) : memref "test.source"(%b) : (memref) -> () return %b : memref -} \ No newline at end of file +} + +// ----- + +// no change due to writing to func args +// CHECK-LABEL: func @return_arg( +// CHECK-SAME: %[[ARG0:.*]]: memref<128x256xf32>, %[[ARG1:.*]]: memref<128x256xf32>, %[[ARG2:.*]]: memref<128x256xf32>) { +// CHECK: "test.source"(%[[ARG0]], %[[ARG1]]) +// CHECK: memref.copy +// CHECK: return +// CHECK: } +func.func @return_arg(%arg0: memref<128x256xf32>, %arg1: memref<128x256xf32>) -> memref<128x256xf32> { + "test.source"(%arg0, %arg1) : (memref<128x256xf32>, memref<128x256xf32>) -> () + return %arg0 : memref<128x256xf32> +} From 6238159e886baca5ebf31fd6b15d79db30ced889 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 4 Sep 2024 20:46:21 +0800 Subject: [PATCH 089/425] [CodeGen][NewPM] Fix linker error due to dummy `MachineVerifierPass` (#107237) Forgot to remove the dummy registry in `MachinePassRegistry.def`. 
--- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + llvm/include/llvm/Passes/MachinePassRegistry.def | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 64d20ae5b20ef..602e64f23ed51 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -45,6 +45,7 @@ #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/RegAllocFast.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index b710b1c46f643..0423e20f58d30 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -228,7 +228,6 @@ DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass) DUMMY_MACHINE_FUNCTION_PASS("machineinstr-printer", MachineFunctionPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass) -DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass) DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass) DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass) From df5840f9f09280a33923f119db5a82e0cda3622d Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 4 Sep 2024 15:58:17 +0300 Subject: [PATCH 090/425] [AMDGPU][Docs] Update product names for some targets (#106973) Based on https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus. --- llvm/docs/AMDGPUUsage.rst | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 80140734cbefd..a5ad3f6bbf7b2 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -392,12 +392,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item Add product IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: + ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X + - tgsplit flat - AMD Instinct MI300A + - xnack scratch - kernarg preload - Packed - work-item Add product - IDs names. 
+ work-item + IDs **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_ ----------------------------------------------------------------------------------------------------------------------- @@ -424,6 +424,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following ``gfx1030`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 6800 - wavefrontsize64 flat - *pal-amdhsa* - Radeon RX 6800 XT scratch - *pal-amdpal* - Radeon RX 6900 XT + - Radeon PRO W6800 + - Radeon PRO V620 ``gfx1031`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 6700 XT - wavefrontsize64 flat - *pal-amdhsa* scratch - *pal-amdpal* @@ -462,12 +464,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following **GCN GFX11 (RDNA 3)** [AMD-GCN-GFX11-RDNA3]_ ----------------------------------------------------------------------------------------------------------------------- - ``gfx1100`` ``amdgcn`` dGPU - cumode - Architected - *pal-amdpal* *TBA* - - wavefrontsize64 flat - scratch .. TODO:: - - Packed - work-item Add product - IDs names. + ``gfx1100`` ``amdgcn`` dGPU - cumode - Architected - *pal-amdpal* - Radeon PRO W7900 Dual Slot + - wavefrontsize64 flat - Radeon PRO W7900 + scratch - Radeon PRO W7800 + - Packed - Radeon RX 7900 XTX + work-item - Radeon RX 7900 XT + IDs - Radeon RX 7900 GRE ``gfx1101`` ``amdgcn`` dGPU - cumode - Architected *TBA* - wavefrontsize64 flat From 205f7ee737f75e666f70ad51bda5f778c02ab124 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 15:01:16 +0200 Subject: [PATCH 091/425] [Lint] Skip null args when checking noalias Do not emit a warning if there are two null noalias arguments, as they cannot be dereferenced anyway. This is a common pattern for `@.omp_outlined`, which has some optional noalias arguments. --- llvm/lib/Analysis/Lint.cpp | 3 ++- llvm/test/Analysis/Lint/noalias-null.ll | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Analysis/Lint/noalias-null.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 00e430ce8e0ab..e0a029802bbd9 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -244,7 +244,8 @@ void Lint::visitCallBase(CallBase &I) { // dereferenced anyway. if (I.doesNotAccessMemory(ArgNo)) continue; - if (AI != BI && (*BI)->getType()->isPointerTy()) { + if (AI != BI && (*BI)->getType()->isPointerTy() && + !isa(*BI)) { AliasResult Result = AA->alias(*AI, *BI); Check(Result != AliasResult::MustAlias && Result != AliasResult::PartialAlias, diff --git a/llvm/test/Analysis/Lint/noalias-null.ll b/llvm/test/Analysis/Lint/noalias-null.ll new file mode 100644 index 0000000000000..5d2ecb5da5898 --- /dev/null +++ b/llvm/test/Analysis/Lint/noalias-null.ll @@ -0,0 +1,14 @@ +; RUN: opt < %s -passes=lint -disable-output 2>&1 | FileCheck --allow-empty %s + +declare void @foo(ptr noalias, ptr noalias) + +define void @test() { +entry: + call void @foo(ptr null, ptr null) + ret void +} + +; Lint should not complain about passing null to both arguments if they are +; null, since noalias only applies if the argument is written to, which is not +; possible for a null pointer. +; CHECK-NOT: Unusual: noalias argument aliases another argument From 3d53212f6104c27df5097301587ece69db9c007e Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 4 Sep 2024 15:03:36 +0200 Subject: [PATCH 092/425] [LLD][COFF] Initial support for ARM64EC importlibs. 
(#107164) Use demangled symbol name for __imp_ symbols and define demangled thunk symbol as AMD64 thunk. --- lld/COFF/InputFiles.cpp | 23 +++++++++-- lld/test/COFF/arm64ec-import.test | 68 +++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 lld/test/COFF/arm64ec-import.test diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 50bc62312a6f8..fa2d230075d9d 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -25,6 +25,7 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/IR/Mangler.h" #include "llvm/LTO/LTO.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" @@ -1019,9 +1020,17 @@ void ImportFile::parse() { // Read names and create an __imp_ symbol. StringRef buf = mb.getBuffer().substr(sizeof(*hdr)); - StringRef name = saver().save(buf.split('\0').first); + auto split = buf.split('\0'); + buf = split.second; + StringRef name; + if (isArm64EC(hdr->Machine)) { + if (std::optional demangledName = + getArm64ECDemangledFunctionName(split.first)) + name = saver().save(*demangledName); + } + if (name.empty()) + name = saver().save(split.first); StringRef impName = saver().save("__imp_" + name); - buf = buf.substr(name.size() + 1); dllName = buf.split('\0').first; StringRef extName; switch (hdr->getNameType()) { @@ -1058,8 +1067,14 @@ void ImportFile::parse() { // If type is function, we need to create a thunk which jump to an // address pointed by the __imp_ symbol. (This allows you to call // DLL functions just like regular non-DLL functions.) - if (hdr->getType() == llvm::COFF::IMPORT_CODE) - thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); + if (hdr->getType() == llvm::COFF::IMPORT_CODE) { + if (ctx.config.machine != ARM64EC) { + thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine); + } else { + thunkSym = ctx.symtab.addImportThunk(name, impSym, AMD64); + // FIXME: Add aux IAT symbols. 
+ } + } } BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test new file mode 100644 index 0000000000000..2a80a30910b5c --- /dev/null +++ b/lld/test/COFF/arm64ec-import.test @@ -0,0 +1,68 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib +RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib +RUN: llvm-lib -machine:x64 -def:test.def -out:test-x86_64.lib + +Link using ARM64EC import library: +RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj \ +RUN: test.obj test-arm64ec.lib test2-arm64ec.lib + +Link using x86_64 import library: +RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj \ +RUN: test.obj test-x86_64.lib test2-arm64ec.lib + +RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s +RUN: llvm-readobj --coff-imports out2.dll | FileCheck --check-prefix=IMPORTS %s +IMPORTS: Import { +IMPORTS-NEXT: Name: test.dll +IMPORTS-NEXT: ImportLookupTableRVA: +IMPORTS-NEXT: ImportAddressTableRVA: 0x2258 +IMPORTS-NEXT: Symbol: data (0) +IMPORTS-NEXT: Symbol: func (0) +IMPORTS-NEXT: Symbol: func2 (0) +IMPORTS-NEXT: } +IMPORTS-NEXT: Import { +IMPORTS-NEXT: Name: test2.dll +IMPORTS-NEXT: ImportLookupTableRVA: +IMPORTS-NEXT: ImportAddressTableRVA: 0x2278 +IMPORTS-NEXT: Symbol: t2func (0) +IMPORTS-NEXT: } + +RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s +RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s + +DISASM: 0000000180001000 <.text>: +DISASM-NEXT: 180001000: ff 25 5a 12 00 00 jmpq *0x125a(%rip) # 0x180002260 + +RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s +RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s +TESTSEC: 0x180004000 60220000 58220000 68220000 78220000 +TESTSEC-NEXT: 0x180004010 00100000 + +#--- test.s + .section .test, "r" + .globl arm64ec_data_sym + .p2align 2, 0x0 +arm64ec_data_sym: + .rva __imp_func + .rva __imp_data + .rva __imp_func2 + .rva __imp_t2func + .rva func + +#--- test.def +NAME test.dll +EXPORTS + data DATA + func + func2 + unused_func + +#--- test2.def +NAME test2.dll +EXPORTS + t2func From f11915153761e0c2691945add795c891e63c0c4a Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 14:09:04 +0100 Subject: [PATCH 093/425] IVDescriptors: improve readability of a function (NFC) (#106219) Avoid dereferencing operand to llvm::isa. --- llvm/lib/Analysis/IVDescriptors.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 560955dede84f..fdf78b9f8a44e 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -734,13 +734,12 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) { Value *FalseVal = SI->getFalseValue(); // Handle only when either of operands of select instruction is a PHI // node for now. - if ((isa(*TrueVal) && isa(*FalseVal)) || - (!isa(*TrueVal) && !isa(*FalseVal))) + if ((isa(TrueVal) && isa(FalseVal)) || + (!isa(TrueVal) && !isa(FalseVal))) return InstDesc(false, I); - Instruction *I1 = - isa(*TrueVal) ? 
dyn_cast(FalseVal) - : dyn_cast(TrueVal); + Instruction *I1 = isa(TrueVal) ? dyn_cast(FalseVal) + : dyn_cast(TrueVal); if (!I1 || !I1->isBinaryOp()) return InstDesc(false, I); @@ -754,8 +753,8 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) { (m_Mul(m_Value(Op1), m_Value(Op2)).match(I1)))) return InstDesc(false, I); - Instruction *IPhi = isa(*Op1) ? dyn_cast(Op1) - : dyn_cast(Op2); + Instruction *IPhi = isa(Op1) ? dyn_cast(Op1) + : dyn_cast(Op2); if (!IPhi || IPhi != FalseVal) return InstDesc(false, I); From 4552153c37e04def01e99e32c02eab245d92b753 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 4 Sep 2024 18:43:09 +0530 Subject: [PATCH 094/425] [CodeGen][MachineCSE] Remove unused AA results(NFC) (#106604) Alias Analysis result is never used in this pass and hence removing it. --- llvm/lib/CodeGen/MachineCSE.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index aadc54b495fe2..2ac1fae9ea48c 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -73,7 +72,6 @@ namespace { class MachineCSE : public MachineFunctionPass { const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; - AliasAnalysis *AA = nullptr; MachineDominatorTree *DT = nullptr; MachineRegisterInfo *MRI = nullptr; MachineBlockFrequencyInfo *MBFI = nullptr; @@ -90,7 +88,6 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); AU.addPreservedID(MachineLoopInfoID); AU.addRequired(); AU.addPreserved(); @@ -167,7 +164,6 @@ char &llvm::MachineCSEID = MachineCSE::ID; INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) @@ -943,7 +939,6 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - AA = &getAnalysis().getAAResults(); DT = &getAnalysis().getDomTree(); MBFI = &getAnalysis().getMBFI(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); From 028174aa2c3a9447aca3333e45b5f89e652b74d1 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Wed, 4 Sep 2024 06:19:41 -0700 Subject: [PATCH 095/425] [DebugInfo] Make a test more robust (#106463) This was accidentally matching a metadata record that happend to have three elements, but wasn't the record of interest. Add CHECKs to make sure we've found the correct record. 
--- clang/test/CodeGenCXX/debug-info-lambda-this.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGenCXX/debug-info-lambda-this.cpp b/clang/test/CodeGenCXX/debug-info-lambda-this.cpp index 0a2f08ea4aa6d..e5acab126d72c 100644 --- a/clang/test/CodeGenCXX/debug-info-lambda-this.cpp +++ b/clang/test/CodeGenCXX/debug-info-lambda-this.cpp @@ -21,7 +21,8 @@ int main() { return 0; } -// CHECK: !{![[FOO_THIS:[0-9]+]], ![[FOO_AA:[0-9]+]], ![[FOO_OPERATOR:[0-9]+]]} +// CHECK: distinct !DICompositeType(tag: DW_TAG_class_type, name: "", {{.*}}, elements: ![[ELEMENT_TAG:[0-9]+]] +// CHECK: ![[ELEMENT_TAG]] = !{![[FOO_THIS:[0-9]+]], ![[FOO_AA:[0-9]+]], ![[FOO_OPERATOR:[0-9]+]]} // CHECK-NEXT: ![[FOO_THIS]] = !DIDerivedType(tag: DW_TAG_member, name: "__this", scope: ![[#]], file: ![[#]], line: [[#]], baseType: ![[#]], size: [[#]]) // CHECK-NEXT: ![[FOO_AA]] = !DIDerivedType(tag: DW_TAG_member, name: "aa", scope: ![[#]], file: ![[#]], line: [[#]], baseType: ![[#]], size: [[#]], offset: [[#]]) // CHECK-NEXT: ![[FOO_OPERATOR]] = !DISubprogram(name: "operator()", scope: ![[#]], file: ![[#]], line: [[#]], type: ![[#]], scopeLine: [[#]], flags: DIFlagPublic | DIFlagPrototyped, spFlags: 0) From 6c143a86cddbc6d0431dd643bfc7d4f017042512 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 4 Sep 2024 18:54:07 +0530 Subject: [PATCH 096/425] [CodeGen][NewPM] Port MachineCSE pass to new pass manager. (#106605) --- llvm/include/llvm/CodeGen/MachineCSE.h | 29 ++ llvm/include/llvm/CodeGen/Passes.h | 2 +- llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/MachineCSE.cpp | 284 ++++++++++-------- llvm/lib/CodeGen/TargetPassConfig.cpp | 6 +- llvm/lib/Passes/PassBuilder.cpp | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- .../GlobalISel/machine-cse-mid-pipeline.mir | 1 + .../AArch64/sve-pfalse-machine-cse.mir | 1 + .../no-cse-nonlocal-convergent-instrs.mir | 1 + .../copyprop_regsequence_with_undef.mir | 1 + llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir | 8 +- .../CodeGen/PowerPC/machine-cse-rm-pre.mir | 1 + .../CodeGen/Thumb/machine-cse-deadreg.mir | 1 + .../CodeGen/Thumb/machine-cse-physreg.mir | 1 + llvm/test/CodeGen/X86/cse-two-preds.mir | 1 + llvm/test/DebugInfo/MIR/X86/machine-cse.mir | 1 + 21 files changed, 216 insertions(+), 134 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/MachineCSE.h diff --git a/llvm/include/llvm/CodeGen/MachineCSE.h b/llvm/include/llvm/CodeGen/MachineCSE.h new file mode 100644 index 0000000000000..f83c25bf39120 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MachineCSE.h @@ -0,0 +1,29 @@ +//===- llvm/CodeGen/MachineCSE.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINECSE_H +#define LLVM_CODEGEN_MACHINECSE_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class MachineCSEPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getRequiredProperties() { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_MACHINECSE_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index dbdd110b0600e..ddb2012cd2bff 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -330,7 +330,7 @@ namespace llvm { extern char &GCMachineCodeAnalysisID; /// MachineCSE - This pass performs global CSE on machine instructions. - extern char &MachineCSEID; + extern char &MachineCSELegacyID; /// MIRCanonicalizer - This pass canonicalizes MIR by renaming vregs /// according to the semantics of the instruction as well as hoists diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 47a1ca15fc0d1..6605c6fde9251 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -188,7 +188,7 @@ void initializeMachineBlockPlacementPass(PassRegistry &); void initializeMachineBlockPlacementStatsPass(PassRegistry &); void initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &); void initializeMachineCFGPrinterPass(PassRegistry &); -void initializeMachineCSEPass(PassRegistry &); +void initializeMachineCSELegacyPass(PassRegistry &); void initializeMachineCombinerPass(PassRegistry &); void initializeMachineCopyPropagationPass(PassRegistry &); void initializeMachineCycleInfoPrinterPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 602e64f23ed51..a99fed86d168d 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -42,6 +42,7 @@ #include "llvm/CodeGen/LocalStackSlotAllocation.h" #include "llvm/CodeGen/LowerEmuTLS.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 0423e20f58d30..4047fd0478579 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -131,6 +131,7 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) #endif MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass()) MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) +MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) @@ -219,7 +220,6 @@ DUMMY_MACHINE_FUNCTION_PASS("livedebugvalues", LiveDebugValuesPass) DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass) DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass) 
DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass) -DUMMY_MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 177702054a0e3..16b8d456748fa 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -75,7 +75,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCFGPrinterPass(Registry); - initializeMachineCSEPass(Registry); + initializeMachineCSELegacyPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); initializeMachineCycleInfoPrinterPassPass(Registry); diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 2ac1fae9ea48c..8e9fcccff7764 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SmallPtrSet.h" @@ -25,6 +26,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -69,110 +71,110 @@ static cl::opt AggressiveMachineCSE( namespace { - class MachineCSE : public MachineFunctionPass { - const TargetInstrInfo *TII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - MachineDominatorTree *DT = nullptr; - MachineRegisterInfo *MRI = nullptr; - MachineBlockFrequencyInfo *MBFI = nullptr; - - public: - static char ID; // Pass identification - - MachineCSE() : MachineFunctionPass(ID) { - initializeMachineCSEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addPreservedID(MachineLoopInfoID); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - } - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties() - .set(MachineFunctionProperties::Property::IsSSA); - } +class MachineCSEImpl { + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineDominatorTree *DT = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + +public: + MachineCSEImpl(MachineDominatorTree *DT, MachineBlockFrequencyInfo *MBFI) + : DT(DT), MBFI(MBFI) {} + bool run(MachineFunction &MF); + +private: + using AllocatorTy = + RecyclingAllocator>; + using ScopedHTType = + ScopedHashTable; + using ScopeType = ScopedHTType::ScopeTy; + using PhysDefVector = SmallVector, 2>; + + unsigned LookAheadLimit = 0; + DenseMap ScopeMap; + DenseMap + PREMap; + ScopedHTType VNT; + SmallVector Exps; + unsigned CurrVN = 0; + + bool PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB); + bool isPhysDefTriviallyDead(MCRegister Reg, + 
MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator E) const; + bool hasLivePhysRegDefUses(const MachineInstr *MI, + const MachineBasicBlock *MBB, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, bool &PhysUseDef) const; + bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, bool &NonLocal) const; + bool isCSECandidate(MachineInstr *MI); + bool isProfitableToCSE(Register CSReg, Register Reg, MachineBasicBlock *CSBB, + MachineInstr *MI); + void EnterScope(MachineBasicBlock *MBB); + void ExitScope(MachineBasicBlock *MBB); + bool ProcessBlockCSE(MachineBasicBlock *MBB); + void ExitScopeIfDone(MachineDomTreeNode *Node, + DenseMap &OpenChildren); + bool PerformCSE(MachineDomTreeNode *Node); + + bool isPRECandidate(MachineInstr *MI, SmallSet &PhysRefs); + bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); + bool PerformSimplePRE(MachineDominatorTree *DT); + /// Heuristics to see if it's profitable to move common computations of MBB + /// and MBB1 to CandidateBB. + bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, MachineBasicBlock *MBB1); + void releaseMemory(); +}; + +class MachineCSELegacy : public MachineFunctionPass { +public: + static char ID; // Pass identification + + MachineCSELegacy() : MachineFunctionPass(ID) { + initializeMachineCSELegacyPass(*PassRegistry::getPassRegistry()); + } - void releaseMemory() override { - ScopeMap.clear(); - PREMap.clear(); - Exps.clear(); - } + bool runOnMachineFunction(MachineFunction &MF) override; - private: - using AllocatorTy = RecyclingAllocator>; - using ScopedHTType = - ScopedHashTable; - using ScopeType = ScopedHTType::ScopeTy; - using PhysDefVector = SmallVector, 2>; - - unsigned LookAheadLimit = 0; - DenseMap ScopeMap; - DenseMap - PREMap; - ScopedHTType VNT; - SmallVector Exps; - unsigned CurrVN = 0; - - bool PerformTrivialCopyPropagation(MachineInstr *MI, - MachineBasicBlock *MBB); - bool isPhysDefTriviallyDead(MCRegister Reg, - MachineBasicBlock::const_iterator I, - MachineBasicBlock::const_iterator E) const; - bool hasLivePhysRegDefUses(const MachineInstr *MI, - const MachineBasicBlock *MBB, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, bool &PhysUseDef) const; - bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, bool &NonLocal) const; - bool isCSECandidate(MachineInstr *MI); - bool isProfitableToCSE(Register CSReg, Register Reg, - MachineBasicBlock *CSBB, MachineInstr *MI); - void EnterScope(MachineBasicBlock *MBB); - void ExitScope(MachineBasicBlock *MBB); - bool ProcessBlockCSE(MachineBasicBlock *MBB); - void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap &OpenChildren); - bool PerformCSE(MachineDomTreeNode *Node); - - bool isPRECandidate(MachineInstr *MI, SmallSet &PhysRefs); - bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); - bool PerformSimplePRE(MachineDominatorTree *DT); - /// Heuristics to see if it's profitable to move common computations of MBB - /// and MBB1 to CandidateBB. 
- bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, - MachineBasicBlock *MBB, - MachineBasicBlock *MBB1); - }; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addPreservedID(MachineLoopInfoID); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; } // end anonymous namespace -char MachineCSE::ID = 0; +char MachineCSELegacy::ID = 0; -char &llvm::MachineCSEID = MachineCSE::ID; +char &llvm::MachineCSELegacyID = MachineCSELegacy::ID; -INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(MachineCSELegacy, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineCSELegacy, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) /// The source register of a COPY machine instruction can be propagated to all /// its users, and this propagation could increase the probability of finding /// common subexpressions. If the COPY has only one user, the COPY itself can /// be removed. -bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, - MachineBasicBlock *MBB) { +bool MachineCSEImpl::PerformTrivialCopyPropagation(MachineInstr *MI, + MachineBasicBlock *MBB) { bool Changed = false; for (MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); @@ -225,7 +227,7 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, return Changed; } -bool MachineCSE::isPhysDefTriviallyDead( +bool MachineCSEImpl::isPhysDefTriviallyDead( MCRegister Reg, MachineBasicBlock::const_iterator I, MachineBasicBlock::const_iterator E) const { unsigned LookAheadLeft = LookAheadLimit; @@ -282,11 +284,11 @@ static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, /// physical registers (except for dead defs of physical registers). It also /// returns the physical register def by reference if it's the only one and the /// instruction does not uses a physical register. -bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, - const MachineBasicBlock *MBB, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, - bool &PhysUseDef) const { +bool MachineCSEImpl::hasLivePhysRegDefUses(const MachineInstr *MI, + const MachineBasicBlock *MBB, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, + bool &PhysUseDef) const { // First, add all uses to PhysRefs. for (const MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); @@ -333,10 +335,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, return !PhysRefs.empty(); } -bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, - SmallSet &PhysRefs, - PhysDefVector &PhysDefs, - bool &NonLocal) const { +bool MachineCSEImpl::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, + SmallSet &PhysRefs, + PhysDefVector &PhysDefs, + bool &NonLocal) const { // For now conservatively returns false if the common subexpression is // not in the same basic block as the given instruction. The only exception // is if the common subexpression is in the sole predecessor block. 
@@ -400,7 +402,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, return false; } -bool MachineCSE::isCSECandidate(MachineInstr *MI) { +bool MachineCSEImpl::isCSECandidate(MachineInstr *MI) { if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || MI->isInlineAsm() || MI->isDebugInstr() || MI->isJumpTableDebugInfo() || MI->isFakeUse()) @@ -437,8 +439,9 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) { /// isProfitableToCSE - Return true if it's profitable to eliminate MI with a /// common expression that defines Reg. CSBB is basic block where CSReg is /// defined. -bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, - MachineBasicBlock *CSBB, MachineInstr *MI) { +bool MachineCSEImpl::isProfitableToCSE(Register CSReg, Register Reg, + MachineBasicBlock *CSBB, + MachineInstr *MI) { if (AggressiveMachineCSE) return true; @@ -513,13 +516,13 @@ bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, return !HasPHI; } -void MachineCSE::EnterScope(MachineBasicBlock *MBB) { +void MachineCSEImpl::EnterScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n'); ScopeType *Scope = new ScopeType(VNT); ScopeMap[MBB] = Scope; } -void MachineCSE::ExitScope(MachineBasicBlock *MBB) { +void MachineCSEImpl::ExitScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); DenseMap::iterator SI = ScopeMap.find(MBB); assert(SI != ScopeMap.end()); @@ -527,7 +530,7 @@ void MachineCSE::ExitScope(MachineBasicBlock *MBB) { ScopeMap.erase(SI); } -bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { +bool MachineCSEImpl::ProcessBlockCSE(MachineBasicBlock *MBB) { bool Changed = false; SmallVector, 8> CSEPairs; @@ -748,9 +751,9 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given /// dominator tree node if its a leaf or all of its children are done. Walk /// up the dominator tree to destroy ancestors which are now done. -void -MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap &OpenChildren) { +void MachineCSEImpl::ExitScopeIfDone( + MachineDomTreeNode *Node, + DenseMap &OpenChildren) { if (OpenChildren[Node]) return; @@ -767,7 +770,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, } } -bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { +bool MachineCSEImpl::PerformCSE(MachineDomTreeNode *Node) { SmallVector Scopes; SmallVector WorkList; DenseMap OpenChildren; @@ -799,8 +802,8 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { // We use stronger checks for PRE candidate rather than for CSE ones to embrace // checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps // to exclude instrs created by PRE that won't be CSEed later. 
-bool MachineCSE::isPRECandidate(MachineInstr *MI, - SmallSet &PhysRefs) { +bool MachineCSEImpl::isPRECandidate(MachineInstr *MI, + SmallSet &PhysRefs) { if (!isCSECandidate(MI) || MI->isNotDuplicable() || MI->mayLoad() || @@ -821,8 +824,8 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI, return true; } -bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, - MachineBasicBlock *MBB) { +bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT, + MachineBasicBlock *MBB) { bool Changed = false; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { SmallSet PhysRefs; @@ -902,7 +905,7 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, // anticipating that the next CSE step will eliminate this created redundancy. // If CSE doesn't eliminate this, than created instruction will remain dead // and eliminated later by Remove Dead Machine Instructions pass. -bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { +bool MachineCSEImpl::PerformSimplePRE(MachineDominatorTree *DT) { SmallVector BBs; PREMap.clear(); @@ -920,9 +923,9 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { return Changed; } -bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, - MachineBasicBlock *MBB, - MachineBasicBlock *MBB1) { +bool MachineCSEImpl::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, + MachineBasicBlock *MBB1) { if (CandidateBB->getParent()->getFunction().hasMinSize()) return true; assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB"); @@ -932,18 +935,55 @@ bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1); } -bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; +void MachineCSEImpl::releaseMemory() { + ScopeMap.clear(); + PREMap.clear(); + Exps.clear(); +} +bool MachineCSEImpl::run(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - DT = &getAnalysis().getDomTree(); - MBFI = &getAnalysis().getMBFI(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); bool ChangedPRE, ChangedCSE; ChangedPRE = PerformSimplePRE(DT); ChangedCSE = PerformCSE(DT->getRootNode()); + releaseMemory(); return ChangedPRE || ChangedCSE; } + +PreservedAnalyses MachineCSEPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFPropsModifier _(*this, MF); + + if (MF.getFunction().hasOptNone()) + return PreservedAnalyses::all(); + + MachineDominatorTree &MDT = MFAM.getResult(MF); + MachineBlockFrequencyInfo &MBFI = + MFAM.getResult(MF); + MachineCSEImpl Impl(&MDT, &MBFI); + bool Changed = Impl.run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserveSet(); + return PA; +} + +bool MachineCSELegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MachineDominatorTree &MDT = + getAnalysis().getDomTree(); + MachineBlockFrequencyInfo &MBFI = + getAnalysis().getMBFI(); + MachineCSEImpl Impl(&MDT, &MBFI); + return Impl.run(MF); +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index c0b834650d73b..11a7752ef7a38 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -311,7 +311,7 @@ static IdentifyingPassPtr overridePass(AnalysisID 
StandardID, if (StandardID == &EarlyMachineLICMID) return applyDisable(TargetID, DisableMachineLICM); - if (StandardID == &MachineCSEID) + if (StandardID == &MachineCSELegacyID) return applyDisable(TargetID, DisableMachineCSE); if (StandardID == &MachineLICMID) @@ -523,7 +523,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass) DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass) DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass) - DISABLE_PASS(DisableMachineCSE, MachineCSEPass) + DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass) DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass) DISABLE_PASS(DisableMachineLICM, EarlyMachineLICMPass) DISABLE_PASS(DisableMachineSink, MachineSinkingPass) @@ -1305,7 +1305,7 @@ void TargetPassConfig::addMachineSSAOptimization() { addILPOpts(); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&MachineSinkingID); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 1df1449fce597..a22abed8051a1 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -100,6 +100,7 @@ #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 18e466d2bd5b5..9c9c505139373 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1277,7 +1277,7 @@ void GCNPassConfig::addMachineSSAOptimization() { if (isPassEnabled(EnableSDWAPeephole)) { addPass(&SIPeepholeSDWAID); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); } addPass(&DeadMachineInstructionElimID); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e86d3771bd2f2..57b7fa783c14a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -462,7 +462,7 @@ void NVPTXPassConfig::addMachineSSAOptimization() { printAndVerify("After ILP optimizations"); addPass(&EarlyMachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineCSELegacyID); addPass(&MachineSinkingID); printAndVerify("After Machine LICM, CSE and Sinking passes"); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir b/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir index 2b69c13174f6c..015ce5ec2dca6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir @@ -1,4 +1,5 @@ # RUN: llc -run-pass machine-cse -verify-machineinstrs -mtriple aarch64-apple-ios %s -o - | FileCheck %s +# RUN: llc -passes machine-cse -mtriple aarch64-apple-ios %s -o - | FileCheck %s --- name: irtranslated legalized: false diff --git a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir index 8395a7619fbb4..5ebc2f61eaafb 100644 --- a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir +++ b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass=machine-cse -mtriple=aarch64 -mattr=+sve -o - %s | FileCheck %s +# RUN: llc -passes=machine-cse -mtriple=aarch64 -mattr=+sve -o - %s | FileCheck %s --- name: pfalse tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir index 684b5ec3883b2..6eb4df2b48700 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -passes=machine-cse %s | FileCheck %s # LLVM's current definition of `isConvergent` does not necessarily prove that # non-local CSE is illegal. The following test extends the definition of diff --git a/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir b/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir index 1e12a3b22e9a4..fee1391d150f9 100644 --- a/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir +++ b/llvm/test/CodeGen/AMDGPU/copyprop_regsequence_with_undef.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -run-pass=machine-cse -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -passes=machine-cse -o - %s | FileCheck %s # Test to ensure that this does not crash on undefs --- diff --git a/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir b/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir index 89b204d715ded..d32737f05a9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-cse-ssa.mir @@ -1,8 +1,10 @@ # REQUIRES: asserts -# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefixes=ERR,ERR-LEGACY %s +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-cse -o /dev/null %s 2>&1 | FileCheck -check-prefixes=ERR,ERR-NPM %s -# ERR: MachineFunctionProperties required by Machine Common Subexpression Elimination pass are not met by function not_ssa. -# ERR-NEXT: Required properties: IsSSA +# ERR-LEGACY: MachineFunctionProperties required by Machine Common Subexpression Elimination pass are not met by function not_ssa. +# ERR-NPM: MachineFunctionProperties required by MachineCSEPass pass are not met by function not_ssa. 
+# ERR: Required properties: IsSSA # ERR-NEXT: Current properties: NoPHIs --- diff --git a/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir index 32f5e0172047e..0e9459238ce9d 100644 --- a/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir +++ b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc %s -o - -mtriple=powerpc-unknown-unknown -run-pass=machine-cse -verify-machineinstrs | FileCheck %s +# RUN: llc %s -o - -mtriple=powerpc-unknown-unknown -passes=machine-cse | FileCheck %s --- | define void @can_pre() { entry: diff --git a/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir b/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir index e4db7abeea354..cee5c24847f34 100644 --- a/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir +++ b/llvm/test/CodeGen/Thumb/machine-cse-deadreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple thumbv6m-arm-none-eabi -run-pass=machine-cse -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple thumbv6m-arm-none-eabi -passes=machine-cse -o - %s | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir b/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir index 2fa22feb4e1b3..58e1eca22711a 100644 --- a/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir +++ b/llvm/test/CodeGen/Thumb/machine-cse-physreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple thumbv5e -run-pass=machine-cse -o - %s | FileCheck %s +# RUN: llc -mtriple thumbv5e -passes=machine-cse -o - %s | FileCheck %s # This is a contrived example made to expose a bug in # MachineCSE, see PR32538. diff --git a/llvm/test/CodeGen/X86/cse-two-preds.mir b/llvm/test/CodeGen/X86/cse-two-preds.mir index 6479747daf426..e6f04a6ce66d4 100644 --- a/llvm/test/CodeGen/X86/cse-two-preds.mir +++ b/llvm/test/CodeGen/X86/cse-two-preds.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=x86_64 -verify-machineinstrs --run-pass=machine-cse -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64 -passes=machine-cse -o - %s | FileCheck %s --- | target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir index 120dbdf850cc4..9bcb4408312a6 100644 --- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir +++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir @@ -1,4 +1,5 @@ # RUN: llc %s -o - -run-pass=machine-cse -mtriple=x86_64-- | FileCheck %s +# RUN: llc %s -o - -passes=machine-cse -mtriple=x86_64-- | FileCheck %s # # This test examines machine-cse's behaviour when dealing with copy propagation, # the code for which is lifted from test/CodeGen/X86/machine-cse.ll. There are From 7732d8e51819416b9d28b1815bdf81d0e0642b04 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 4 Sep 2024 06:51:30 -0700 Subject: [PATCH 097/425] [ADT] Deprecate DenseMap::FindAndConstruct (#107224) I've migrated all uses of FindAndConstruct to operator[] and try_emplace. This patch inlines FindAndConstruct into operator[] and deprecates FindAndConstruct. 
--- llvm/include/llvm/ADT/DenseMap.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index e78700f9a9f3a..745288ff047f4 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -354,7 +354,8 @@ class DenseMapBase : public DebugEpochBase { incrementNumTombstones(); } - value_type& FindAndConstruct(const KeyT &Key) { + LLVM_DEPRECATED("Use [Key] instead", "[Key]") + value_type &FindAndConstruct(const KeyT &Key) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return *TheBucket; @@ -363,10 +364,15 @@ class DenseMapBase : public DebugEpochBase { } ValueT &operator[](const KeyT &Key) { - return FindAndConstruct(Key).second; + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket->second; + + return InsertIntoBucket(TheBucket, Key)->second; } - value_type& FindAndConstruct(KeyT &&Key) { + LLVM_DEPRECATED("Use [Key] instead", "[Key]") + value_type &FindAndConstruct(KeyT &&Key) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return *TheBucket; @@ -375,7 +381,11 @@ class DenseMapBase : public DebugEpochBase { } ValueT &operator[](KeyT &&Key) { - return FindAndConstruct(std::move(Key)).second; + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket->second; + + return InsertIntoBucket(TheBucket, std::move(Key))->second; } /// isPointerIntoBucketsArray - Return true if the specified pointer points From 1b0a80249399dadfe0c3f682fff77bf9eb666535 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 4 Sep 2024 07:02:55 -0700 Subject: [PATCH 098/425] [GDBRemote] Handle 'heap' memory region info type (#105883) This should cause the memory region info "is stack" field to be set to "no". 
--- .../Process/gdb-remote/GDBRemoteCommunicationClient.cpp | 2 ++ .../gdb-remote/GDBRemoteCommunicationClientTest.cpp | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 0297fe363f69e..55d76ca8532d3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1638,6 +1638,8 @@ Status GDBRemoteCommunicationClient::GetMemoryRegionInfo( for (llvm::StringRef entry : llvm::split(value, ',')) { if (entry == "stack") region_info.SetIsStackMemory(MemoryRegionInfo::eYes); + else if (entry == "heap") + region_info.SetIsStackMemory(MemoryRegionInfo::eNo); } } else if (name == "error") { StringExtractorGDBRemote error_extractor(value); diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp index 18020c8e43fe0..ce5ab2cf50829 100644 --- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp +++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp @@ -364,6 +364,15 @@ TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfo) { EXPECT_TRUE(result.get().Success()); EXPECT_EQ(MemoryRegionInfo::eYes, region_info.GetMemoryTagged()); EXPECT_EQ(MemoryRegionInfo::eYes, region_info.IsStackMemory()); + + result = std::async(std::launch::async, [&] { + return client.GetMemoryRegionInfo(addr, region_info); + }); + + HandlePacket(server, "qMemoryRegionInfo:a000", + "start:a000;size:2000;type:heap;"); + EXPECT_TRUE(result.get().Success()); + EXPECT_EQ(MemoryRegionInfo::eNo, region_info.IsStackMemory()); } TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfoInvalidResponse) { From 30f1cfb4d0784de869ab3a4a9774b696b9769093 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 07:04:32 -0700 Subject: [PATCH 099/425] [TableGen] Print memory stats in detailed record emitter (#106990) Print memory allocation and related statistics when dumping detailed record information. 
--- llvm/include/llvm/TableGen/Record.h | 4 ++- llvm/lib/TableGen/DetailedRecordsBackend.cpp | 17 ++++++---- llvm/lib/TableGen/Record.cpp | 33 ++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index a339946e67cf2..ff596df94e4f5 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -1757,7 +1757,7 @@ class Record { ArrayRef getAssertions() const { return Assertions; } ArrayRef getDumps() const { return Dumps; } - ArrayRef> getSuperClasses() const { + ArrayRef> getSuperClasses() const { return SuperClasses; } @@ -2073,6 +2073,8 @@ class RecordKeeper { void dump() const; + void dumpAllocationStats(raw_ostream &OS) const; + private: RecordKeeper(RecordKeeper &&) = delete; RecordKeeper(const RecordKeeper &) = delete; diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 45e621483c817..7d17c4d68c3f1 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -41,6 +41,7 @@ class DetailedRecordsEmitter { void printVariables(raw_ostream &OS); void printClasses(raw_ostream &OS); void printRecords(raw_ostream &OS); + void printAllocationStats(raw_ostream &OS); void printDefms(const Record &Rec, raw_ostream &OS); void printTemplateArgs(const Record &Rec, raw_ostream &OS); void printSuperclasses(const Record &Rec, raw_ostream &OS); @@ -55,6 +56,7 @@ void DetailedRecordsEmitter::run(raw_ostream &OS) { printVariables(OS); printClasses(OS); printRecords(OS); + printAllocationStats(OS); } // Print the report heading, including the source file name. @@ -62,8 +64,7 @@ void DetailedRecordsEmitter::printReportHeading(raw_ostream &OS) { OS << formatv("DETAILED RECORDS for file {0}\n", Records.getInputFilename()); } -// Print a section heading with the name of the section and -// the item count. +// Print a section heading with the name of the section and the item count. void DetailedRecordsEmitter::printSectionHeading(StringRef Title, int Count, raw_ostream &OS) { OS << formatv("\n{0} {1} ({2}) {0}\n", "--------------------", Title, Count); @@ -79,8 +80,7 @@ void DetailedRecordsEmitter::printVariables(raw_ostream &OS) { OS << Var.first << " = " << Var.second->getAsString() << '\n'; } -// Print the classes, including the template arguments, superclasses, -// and fields. +// Print classes, including the template arguments, superclasses, and fields. void DetailedRecordsEmitter::printClasses(raw_ostream &OS) { const auto &ClassList = Records.getClasses(); printSectionHeading("Classes", ClassList.size(), OS); @@ -94,8 +94,7 @@ void DetailedRecordsEmitter::printClasses(raw_ostream &OS) { } } -// Print the records, including the defm sequences, supercasses, -// and fields. +// Print the records, including the defm sequences, supercasses, and fields. void DetailedRecordsEmitter::printRecords(raw_ostream &OS) { const auto &RecordList = Records.getDefs(); printSectionHeading("Records", RecordList.size(), OS); @@ -110,6 +109,12 @@ void DetailedRecordsEmitter::printRecords(raw_ostream &OS) { } } +// Print memory allocation related stats. +void DetailedRecordsEmitter::printAllocationStats(raw_ostream &OS) { + OS << formatv("\n{0} Memory Allocation Stats {0}\n", "--------------------"); + Records.dumpAllocationStats(OS); +} + // Print the record's defm source locations, if any. Note that they // are stored in the reverse order of their invocation. 
 void DetailedRecordsEmitter::printDefms(const Record &Rec, raw_ostream &OS) {
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index bcecee8e550c8..cead8f865a607 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -92,10 +92,39 @@ struct RecordKeeperImpl {
   unsigned AnonCounter;
   unsigned LastRecordID;
+
+  void dumpAllocationStats(raw_ostream &OS) const;
 };
 } // namespace detail
 } // namespace llvm
 
+void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const {
+  // Dump memory allocation related stats.
+  OS << "TheArgumentInitPool size = " << TheArgumentInitPool.size() << '\n';
+  OS << "TheBitsInitPool size = " << TheBitsInitPool.size() << '\n';
+  OS << "TheIntInitPool size = " << TheIntInitPool.size() << '\n';
+  OS << "TheBitsInitPool size = " << TheBitsInitPool.size() << '\n';
+  OS << "TheListInitPool size = " << TheListInitPool.size() << '\n';
+  OS << "TheUnOpInitPool size = " << TheUnOpInitPool.size() << '\n';
+  OS << "TheBinOpInitPool size = " << TheBinOpInitPool.size() << '\n';
+  OS << "TheTernOpInitPool size = " << TheTernOpInitPool.size() << '\n';
+  OS << "TheFoldOpInitPool size = " << TheFoldOpInitPool.size() << '\n';
+  OS << "TheIsAOpInitPool size = " << TheIsAOpInitPool.size() << '\n';
+  OS << "TheExistsOpInitPool size = " << TheExistsOpInitPool.size() << '\n';
+  OS << "TheCondOpInitPool size = " << TheCondOpInitPool.size() << '\n';
+  OS << "TheDagInitPool size = " << TheDagInitPool.size() << '\n';
+  OS << "RecordTypePool size = " << RecordTypePool.size() << '\n';
+  OS << "TheVarInitPool size = " << TheVarInitPool.size() << '\n';
+  OS << "TheVarBitInitPool size = " << TheVarBitInitPool.size() << '\n';
+  OS << "TheVarDefInitPool size = " << TheVarDefInitPool.size() << '\n';
+  OS << "TheFieldInitPool size = " << TheFieldInitPool.size() << '\n';
+  OS << "Bytes allocated = " << Allocator.getBytesAllocated() << '\n';
+  OS << "Total allocator memory = " << Allocator.getTotalMemory() << "\n\n";
+
+  OS << "Number of records instantiated = " << LastRecordID << '\n';
+  OS << "Number of anonymous records = " << AnonCounter << '\n';
+}
+
 //===----------------------------------------------------------------------===//
 // Type implementations
 //===----------------------------------------------------------------------===//
 
@@ -3261,6 +3290,10 @@ RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) const {
              : std::vector();
 }
 
+void RecordKeeper::dumpAllocationStats(raw_ostream &OS) const {
+  Impl->dumpAllocationStats(OS);
+}
+
 Init *MapResolver::resolve(Init *VarName) {
   auto It = Map.find(VarName);
   if (It == Map.end())

From e64ef634bbd940dfaae23d9fb43e6385014ffd10 Mon Sep 17 00:00:00 2001
From: Malek Ben Slimane <85631834+malek203@users.noreply.github.com>
Date: Wed, 4 Sep 2024 16:18:11 +0200
Subject: [PATCH 100/425] Thread Safety Analysis: Differentiate between lock sets at real join points and expected/actual sets at function end (#105526)

This fixes false positives related to returning a scoped lockable object.
At the end of a function, we check managed locks instead of scoped locks.
At real join points, we skip checking managed locks because we assume that
the scope keeps track of its underlying mutexes and will release them on
destruction. So, checking for the scopes is sufficient. However, at the end
of a function, we aim to compare the expected and the actual lock sets.
There, we skip checking scoped locks to avoid emitting duplicate warnings
for the same lock.
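For illustration, the kind of code that used to trigger a spurious "mutex is still held at the end of function" warning is a lock-returning accessor. A minimal sketch follows (using the capability-analysis attribute spellings from the Clang thread-safety documentation, not the exact test case updated below):

```cpp
// Minimal sketch, assuming compilation with -Wthread-safety.
#define CAPABILITY(x) __attribute__((capability(x)))
#define SCOPED_CAPABILITY __attribute__((scoped_lockable))
#define ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__)))
#define RELEASE(...) __attribute__((release_capability(__VA_ARGS__)))

class CAPABILITY("mutex") Mutex {
public:
  void Lock() ACQUIRE();
  void Unlock() RELEASE();
};

class SCOPED_CAPABILITY MutexLock {
public:
  explicit MutexLock(Mutex *Mu) ACQUIRE(Mu) : Mu(Mu) { Mu->Lock(); }
  ~MutexLock() RELEASE() { Mu->Unlock(); }

private:
  Mutex *Mu;
};

class Object {
public:
  // The returned scope object keeps `mutex` held for the caller, so the
  // function legitimately ends with the mutex locked. Checking the managed
  // lock (instead of the scope) at function end removes the false positive.
  MutexLock lock() ACQUIRE(mutex) { return MutexLock(&mutex); }

private:
  Mutex mutex;
};
```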
--- clang/lib/Analysis/ThreadSafety.cpp | 8 ++++++-- .../SemaCXX/warn-thread-safety-analysis.cpp | 20 ++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index e25b843c9bf83..c4a83b069e079 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -922,6 +922,9 @@ class ScopedLockableFactEntry : public FactEntry { handleRemovalFromIntersection(const FactSet &FSet, FactManager &FactMan, SourceLocation JoinLoc, LockErrorKind LEK, ThreadSafetyHandler &Handler) const override { + if (LEK == LEK_LockedAtEndOfFunction || LEK == LEK_NotLockedAtEndOfFunction) + return; + for (const auto &UnderlyingMutex : UnderlyingMutexes) { const auto *Entry = FSet.findLock(FactMan, UnderlyingMutex.Cap); if ((UnderlyingMutex.Kind == UCK_Acquired && Entry) || @@ -2224,7 +2227,7 @@ void ThreadSafetyAnalyzer::intersectAndWarn(FactSet &EntrySet, if (join(FactMan[*EntryIt], ExitFact, EntryLEK != LEK_LockedSomeLoopIterations)) *EntryIt = Fact; - } else if (!ExitFact.managed()) { + } else if (!ExitFact.managed() || EntryLEK == LEK_LockedAtEndOfFunction) { ExitFact.handleRemovalFromIntersection(ExitSet, FactMan, JoinLoc, EntryLEK, Handler); } @@ -2236,7 +2239,8 @@ void ThreadSafetyAnalyzer::intersectAndWarn(FactSet &EntrySet, const FactEntry *ExitFact = ExitSet.findLock(FactMan, *EntryFact); if (!ExitFact) { - if (!EntryFact->managed() || ExitLEK == LEK_LockedSomeLoopIterations) + if (!EntryFact->managed() || ExitLEK == LEK_LockedSomeLoopIterations || + ExitLEK == LEK_NotLockedAtEndOfFunction) EntryFact->handleRemovalFromIntersection(EntrySetOrig, FactMan, JoinLoc, ExitLEK, Handler); if (ExitLEK == LEK_LockedSomePredecessors) diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp index af9254508d805..8477200456d98 100644 --- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp +++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp @@ -6077,24 +6077,20 @@ namespace ReturnScopedLockable { class Object { public: MutexLock lock() EXCLUSIVE_LOCK_FUNCTION(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return MutexLock(&mutex); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return MutexLock(&mutex); + } ReaderMutexLock lockShared() SHARED_LOCK_FUNCTION(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return ReaderMutexLock(&mutex); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return ReaderMutexLock(&mutex); + } MutexLock adopt() EXCLUSIVE_LOCKS_REQUIRED(mutex) { - // TODO: False positive because scoped lock isn't destructed. - return MutexLock(&mutex, true); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return MutexLock(&mutex, true); + } ReaderMutexLock adoptShared() SHARED_LOCKS_REQUIRED(mutex) { - // TODO: False positive because scoped lock isn't destructed. 
- return ReaderMutexLock(&mutex, true); // expected-note {{mutex acquired here}} - } // expected-warning {{mutex 'mutex' is still held at the end of function}} + return ReaderMutexLock(&mutex, true); + } int x GUARDED_BY(mutex); void needsLock() EXCLUSIVE_LOCKS_REQUIRED(mutex); From 8f77d37f256809766fd83a09c6d144b785e9165a Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 4 Sep 2024 07:18:53 -0700 Subject: [PATCH 101/425] [DAGCombiner] cache negative result from getMergeStoreCandidates() (#106949) Cache negative search result from getStoreMergeCandidates() so that mergeConsecutiveStores() does not iterate quadratically over a potentially long sequence of unmergeable stores. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 83 ++++++++++++------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6390231341f96..37272a09b336a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -191,6 +191,11 @@ namespace { // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; + /// This caches all chains that have already been processed in + /// DAGCombiner::getStoreMergeCandidates() and found to have no mergeable + /// stores candidates. + SmallPtrSet ChainsWithoutMergeableStores; + /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { @@ -779,11 +784,10 @@ namespace { bool UseTrunc); /// This is a helper function for mergeConsecutiveStores. Stores that - /// potentially may be merged with St are placed in StoreNodes. RootNode is - /// a chain predecessor to all store candidates. - void getStoreMergeCandidates(StoreSDNode *St, - SmallVectorImpl &StoreNodes, - SDNode *&Root); + /// potentially may be merged with St are placed in StoreNodes. On success, + /// returns a chain predecessor to all store candidates. + SDNode *getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes); /// Helper function for mergeConsecutiveStores. Checks if candidate stores /// have indirect dependency through their operands. RootNode is the @@ -1785,6 +1789,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { ++NodesCombined; + // Invalidate cached info. + ChainsWithoutMergeableStores.clear(); + // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist @@ -20514,15 +20521,15 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( return true; } -void DAGCombiner::getStoreMergeCandidates( - StoreSDNode *St, SmallVectorImpl &StoreNodes, - SDNode *&RootNode) { +SDNode * +DAGCombiner::getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. We must have a base and an offset. Do not handle stores to undef // base pointers. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) - return; + return nullptr; SDValue Val = peekThroughBitcasts(St->getValue()); StoreSource StoreSrc = getStoreSource(Val); @@ -20538,14 +20545,14 @@ void DAGCombiner::getStoreMergeCandidates( LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. 
if (MemVT != LoadVT) - return; + return nullptr; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) - return; + return nullptr; // The memory operands must not be volatile/indexed/atomic. // TODO: May be able to relax for unordered atomics (see D66309) if (!Ld->isSimple() || Ld->isIndexed()) - return; + return nullptr; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -20613,6 +20620,27 @@ void DAGCombiner::getStoreMergeCandidates( return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; + // We are looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |-------|-------| + // Load Load Store3 + // | | + // Store1 Store2 + // + // FIXME: We should be able to climb and + // descend TokenFactors to find candidates as well. + + SDNode *RootNode = St->getChain().getNode(); + // Bail out if we already analyzed this root node and found nothing. + if (ChainsWithoutMergeableStores.contains(RootNode)) + return nullptr; + // Check if the pair of StoreNode and the RootNode already bail out many // times which is over the limit in dependence check. auto OverLimitInDependenceCheck = [&](SDNode *StoreNode, @@ -20636,28 +20664,13 @@ void DAGCombiner::getStoreMergeCandidates( } }; - // We looking for a root node which is an ancestor to all mergable - // stores. We search up through a load, to our root and then down - // through all children. For instance we will find Store{1,2,3} if - // St is Store1, Store2. or Store3 where the root is not a load - // which always true for nonvolatile ops. TODO: Expand - // the search to find all valid candidates through multiple layers of loads. - // - // Root - // |-------|-------| - // Load Load Store3 - // | | - // Store1 Store2 - // - // FIXME: We should be able to climb and - // descend TokenFactors to find candidates as well. - - RootNode = St->getChain().getNode(); - unsigned NumNodesExplored = 0; const unsigned MaxSearchNodes = 1024; if (auto *Ldn = dyn_cast(RootNode)) { RootNode = Ldn->getChain().getNode(); + // Bail out if we already analyzed this root node and found nothing. + if (ChainsWithoutMergeableStores.contains(RootNode)) + return nullptr; for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) { if (I.getOperandNo() == 0 && isa(*I)) { // walk down chain @@ -20674,6 +20687,8 @@ void DAGCombiner::getStoreMergeCandidates( I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) TryToAddCandidate(I); } + + return RootNode; } // We need to check that merging these stores does not cause a loop in the @@ -21304,9 +21319,8 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { return false; SmallVector StoreNodes; - SDNode *RootNode; // Find potential store merge candidates by searching through chain sub-DAG - getStoreMergeCandidates(St, StoreNodes, RootNode); + SDNode *RootNode = getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. 
if (StoreNodes.size() < 2) @@ -21362,6 +21376,11 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { llvm_unreachable("Unhandled store source type"); } } + + // Remember if we failed to optimize, to save compile time. + if (!MadeChange) + ChainsWithoutMergeableStores.insert(RootNode); + return MadeChange; } From 3bc38fb27a12f785d8e78b8d00cbd277464ace92 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 4 Sep 2024 16:22:43 +0200 Subject: [PATCH 102/425] [InstCombine] Generalize and consolidate phi translation check (#106051) The foldOpIntoPhi() transforms requires all operands to be phi-translatable. This can be the case either because they are phi nodes in the same block, or because the operand dominates the block. Currently, most callers of foldOpIntoPhi() satisfy this pre-condition by requiring a constant operand, which trivially dominates everything. Only selects had handling for variable operands. Move this logic into foldOpIntoPhi(), so things are handled correctly if other callers are generalized. Also make the implementation a bit more general by querying the dominator tree. --- .../InstCombine/InstCombineSelect.cpp | 42 +------------------ .../InstCombine/InstructionCombining.cpp | 27 ++++++++++-- .../InstCombine/phi-select-constant.ll | 8 ++-- 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index fcd11126073bf..66f7c4592457c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1992,41 +1992,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, return Changed ? &SI : nullptr; } -/// SI is a select whose condition is a PHI node (but the two may be in -/// different blocks). See if the true/false values (V) are live in all of the -/// predecessor blocks of the PHI. For example, cases like this can't be mapped: -/// -/// X = phi [ C1, BB1], [C2, BB2] -/// Y = add -/// Z = select X, Y, 0 -/// -/// because Y is not live in BB1/BB2. -static bool canSelectOperandBeMappingIntoPredBlock(const Value *V, - const SelectInst &SI) { - // If the value is a non-instruction value like a constant or argument, it - // can always be mapped. - const Instruction *I = dyn_cast(V); - if (!I) return true; - - // If V is a PHI node defined in the same block as the condition PHI, we can - // map the arguments. - const PHINode *CondPHI = cast(SI.getCondition()); - - if (const PHINode *VP = dyn_cast(I)) - if (VP->getParent() == CondPHI->getParent()) - return true; - - // Otherwise, if the PHI and select are defined in the same block and if V is - // defined in a different block, then we can transform it. - if (SI.getParent() == CondPHI->getParent() && - I->getParent() != CondPHI->getParent()) - return true; - - // Otherwise we have a 'hard' case and we can't tell without doing more - // detailed dominator based analysis, punt. - return false; -} - /// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner, @@ -3929,11 +3894,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { // See if we can fold the select into a phi node if the condition is a select. if (auto *PN = dyn_cast(SI.getCondition())) - // The true/false values have to be live in the PHI predecessor's blocks. 
- if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) && - canSelectOperandBeMappingIntoPredBlock(FalseVal, SI)) - if (Instruction *NV = foldOpIntoPhi(SI, PN)) - return NV; + if (Instruction *NV = foldOpIntoPhi(SI, PN)) + return NV; if (SelectInst *TrueSI = dyn_cast(TrueVal)) { if (TrueSI->getCondition()->getType() == CondVal->getType()) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index ad2a620081bcd..8195e0539305c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1730,8 +1730,7 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN, const DataLayout &DL, const SimplifyQuery SQ) { // NB: It is a precondition of this transform that the operands be - // phi translatable! This is usually trivially satisfied by limiting it - // to constant ops, and for selects we do a more sophisticated check. + // phi translatable! SmallVector Ops; for (Value *Op : I.operands()) { if (Op == PN) @@ -1784,9 +1783,31 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { // Otherwise, we can replace *all* users with the new PHI we form. } + // Check that all operands are phi-translatable. + for (Value *Op : I.operands()) { + if (Op == PN) + continue; + + // Non-instructions never require phi-translation. + auto *I = dyn_cast(Op); + if (!I) + continue; + + // Phi-translate can handle phi nodes in the same block. + if (isa(I)) + if (I->getParent() == PN->getParent()) + continue; + + // Operand dominates the block, no phi-translation necessary. + if (DT.dominates(I, PN->getParent())) + continue; + + // Not phi-translatable, bail out. + return nullptr; + } + // Check to see whether the instruction can be folded into each phi operand. // If there is one operand that does not fold, remember the BB it is in. - // If there is more than one or if *it* is a PHI, bail out. 
SmallVector NewPhiValues; BasicBlock *NonSimplifiedBB = nullptr; Value *NonSimplifiedInVal = nullptr; diff --git a/llvm/test/Transforms/InstCombine/phi-select-constant.ll b/llvm/test/Transforms/InstCombine/phi-select-constant.ll index 9d61891127690..81a27811ec63f 100644 --- a/llvm/test/Transforms/InstCombine/phi-select-constant.ll +++ b/llvm/test/Transforms/InstCombine/phi-select-constant.ll @@ -226,17 +226,17 @@ final: define i32 @dominating_values_select_not_same_block(i1 %c1, i1 %c2, ptr %p, ptr %p2) { ; CHECK-LABEL: @dominating_values_select_not_same_block( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[P2:%.*]], align 4 ; CHECK-NEXT: br i1 [[C1:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] ; CHECK: delay: +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C2:%.*]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: br label [[FINAL]] ; CHECK: final: -; CHECK-NEXT: [[USE2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[C2:%.*]], [[DELAY]] ] +; CHECK-NEXT: [[USE2:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[TMP0]], [[DELAY]] ] ; CHECK-NEXT: br label [[SPLIT:%.*]] ; CHECK: split: -; CHECK-NEXT: [[VALUE:%.*]] = select i1 [[USE2]], i32 [[A]], i32 [[B]] -; CHECK-NEXT: ret i32 [[VALUE]] +; CHECK-NEXT: ret i32 [[USE2]] ; entry: %a = load i32, ptr %p From 75dc9af1a227e5bfd34eeaf822d2db4520545f14 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 05:57:13 -0700 Subject: [PATCH 103/425] [SLP][NFC]Remove some dead code + reorder calls to avoid extra checks --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a2af7f4e1b01c..60476398e5ca7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7126,8 +7126,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle vectors. - if (!SLPReVec && getValueType(S.OpValue)->isVectorTy() && - !isa(S.OpValue)) { + if (!SLPReVec && getValueType(S.OpValue)->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -13221,9 +13220,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = E->Scalars.front(); - Type *ScalarTy = getValueType(V); - if (isa(V)) - ScalarTy = V->getType(); + Type *ScalarTy = V->getType(); + if (!isa(V)) + ScalarTy = getValueType(V); auto It = MinBWs.find(E); if (It != MinBWs.end()) { auto *VecTy = dyn_cast(ScalarTy); From f7fa75b20835254c35baeff908b8c3827c13db41 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 4 Sep 2024 15:29:32 +0100 Subject: [PATCH 104/425] [AArch64] Implement intrinsics for SME2 FAMIN/FAMAX (#99063) This patch implements these intrinsics: ``` c // Variants are also available for: // [_f32_x2], [_f64_x2], // [_f16_x4], [_f32_x4], [_f64_x4] svfloat16x2_t svamax[_f16_x2](svfloat16x2 zd, svfloat16x2_t zm) __arm_streaming; svfloat16x2_t svamin[_f16_x2](svfloat16x2 zd, svfloat16x2_t zm) __arm_streaming; ``` (cf. 
https://github.com/ARM-software/acle/pull/324) Co-authored-by: Caroline Concatto --- clang/include/clang/Basic/arm_sve.td | 7 + .../acle_sme2_faminmax.c | 476 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 9 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 28 ++ .../AArch64/sme2-intrinsics-faminmax.ll | 241 +++++++++ 5 files changed, 761 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index a0f12e1bbd2d4..edf73d9022b06 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2253,6 +2253,13 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVSQDMULH_X4 : SInst<"svqdmulh[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx4", [IsStreaming], []>; } +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,faminmax" in { + def FAMIN_X2 : Inst<"svamin[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sme_famin_x2", [IsStreaming], []>; + def FAMAX_X2 : Inst<"svamax[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sme_famax_x2", [IsStreaming], []>; + def FAMIN_X4 : Inst<"svamin[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sme_famin_x4", [IsStreaming], []>; + def FAMAX_X4 : Inst<"svamax[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sme_famax_x4", [IsStreaming], []>; +} + let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [VerifyRuntimeMode], []>; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c new file mode 100644 index 0000000000000..5d026f8cde5e0 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c @@ -0,0 +1,476 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +faminmax -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + + +// Multi, x2 + +// CHECK-LABEL: @test_svamax_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f16_x213svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svamax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f16_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f32_x213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svamax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f32_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f64_x213svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svamax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f64_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) 
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f16_x213svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svamin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f16_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f32_x213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svamin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f32_x2)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f64_x213svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svamin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +// CHECK-LABEL: @test_svamax_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] 
= extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f16_x413svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svamax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f16_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f32_x413svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svamax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f32_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamax_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } 
[[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamax_f64_x413svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svamax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamax,_f64_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: 
[[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f16_x413svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svamin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f16_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } 
@llvm.aarch64.sme.famin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f32_x413svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svamin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f32_x4)(zdn, zm); +} + +// CHECK-LABEL: @test_svamin_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], 
i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svamin_f64_x413svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sme.famin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svamin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svamin,_f64_x4)(zdn, zm); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9bce850750f79..8ac1d67e162f7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3521,6 +3521,15 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic; } + // + // Multi-vector floating point absolute min/max number + // + + foreach instr = ["famax", "famin"] in { + def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic; + def int_aarch64_sme_ # instr # _x4 : 
SME2_VG4_Multi_Multi_Intrinsic; + } + // // Multi-vector vertical dot-products // diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index f5f9a62faa0f5..69806c9c3fdbf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5822,6 +5822,34 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::FMAX_VG4_4Z4Z_S, AArch64::FMAX_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sme_famax_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMAX_2Z2Z_H, AArch64::FAMAX_2Z2Z_S, + AArch64::FAMAX_2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; + case Intrinsic::aarch64_sme_famax_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMAX_4Z4Z_H, AArch64::FAMAX_4Z4Z_S, + AArch64::FAMAX_4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; + case Intrinsic::aarch64_sme_famin_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMIN_2Z2Z_H, AArch64::FAMIN_2Z2Z_S, + AArch64::FAMIN_2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; + case Intrinsic::aarch64_sme_famin_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::FAMIN_4Z4Z_H, AArch64::FAMIN_4Z4Z_S, + AArch64::FAMIN_4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_smin_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll new file mode 100644 index 0000000000000..ecfbb47ba5571 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll @@ -0,0 +1,241 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -mattr=+sme2 -mattr=+faminmax -force-streaming -verify-machineinstrs < %s | FileCheck %s + +; FAMAX (Multi, x2) + +define { , } @multi_vec_max_multi_x2_f16( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv8f16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_max_multi_x2_f32( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv4f32( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_max_multi_x2_f64( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famax { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, 
z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famax.x2.nxv2f64( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; FAMAX (Multi, x4) + +define { , , , }@multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.famax.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.famax.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famax.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + + +; FAMIN (Multi, x2) + +define { , } @multi_vec_min_multi_x2_f16( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_min_multi_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv8f16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_min_multi_x2_f32( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_min_multi_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: 
ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv4f32( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @multi_vec_main_multi_x2_f64( %unused, %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_main_multi_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: famin { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d } +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.famin.x2.nxv2f64( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; FAMIN (Multi, x4) + +define { , , , } @multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: famin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } +; CHECK-NEXT: mov z0.d, z24.d +; CHECK-NEXT: mov z1.d, z25.d +; CHECK-NEXT: mov z2.d, z26.d +; CHECK-NEXT: mov z3.d, z27.d +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sme.famin.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} From 660e34fd38c3fb39fba1871bbf5b2eb3a48bf277 Mon Sep 17 00:00:00 2001 From: Alexey Merzlyakov <60094858+AlexeyMerzlyakov@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:31:59 +0300 Subject: [PATCH 105/425] [lldb][RISCV] Support optionally disabled FPR for riscv64 (#104547) The PR adds the support optionally enabled/disabled FP-registers to LLDB `RegisterInfoPOSIX_riscv64`. 
This situation might take place for RISC-V builds having no FP-registers, like RV64IMAC or RV64IMACV. To aim this, patch adds `opt_regsets` flags mechanism. It re-works RegisterInfo class to work with flexibly allocated (depending on `opt_regsets` flag) `m_register_sets` and `m_register_infos` vectors instead of statically defined structures. The registration of regsets is being arranged by `m_per_regset_regnum_range` map. The patch flows are spread to `NativeRegisterContextLinux_riscv64` and `RegisterContextCorePOSIX_riscv64` classes, that were tested on: - x86_64 host working with coredumps - RV64GC and RV64IMAC targets working with coredumps and natively in run-time with binaries `EmulateInstructionRISCV` is out of scope of this patch, and its behavior did not change, using maximum set of registers. According testcase built for RV64IMAC (no-FPR) was added to `TestLinuxCore.py`. --- .../RISCV/EmulateInstructionRISCV.cpp | 8 +- .../NativeRegisterContextLinux_riscv64.cpp | 57 +++++--- .../NativeRegisterContextLinux_riscv64.h | 2 + .../Utility/RegisterContextPOSIX_riscv64.cpp | 3 +- .../Utility/RegisterInfoPOSIX_riscv64.cpp | 132 +++++++++-------- .../Utility/RegisterInfoPOSIX_riscv64.h | 38 +++-- .../Process/Utility/RegisterInfos_riscv64.h | 54 +++---- .../RegisterContextPOSIXCore_riscv64.cpp | 25 ++-- .../RegisterContextPOSIXCore_riscv64.h | 3 - .../postmortem/elf-core/TestLinuxCore.py | 133 ++++++++++++++---- ...iscv64.core => linux-riscv64.gpr_fpr.core} | Bin 20480 -> 20480 bytes .../elf-core/linux-riscv64.gpr_fpr.out | Bin 0 -> 3480 bytes .../elf-core/linux-riscv64.gpr_only.core | Bin 0 -> 28672 bytes .../elf-core/linux-riscv64.gpr_only.out | Bin 0 -> 3520 bytes .../postmortem/elf-core/linux-riscv64.out | Bin 3328 -> 0 bytes .../postmortem/elf-core/main_fpr.c | 14 ++ 16 files changed, 300 insertions(+), 169 deletions(-) rename lldb/test/API/functionalities/postmortem/elf-core/{linux-riscv64.core => linux-riscv64.gpr_fpr.core} (76%) create mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.out create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core create mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.out delete mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index e8014b1eeb378..badc7ba36f011 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1748,10 +1748,10 @@ EmulateInstructionRISCV::GetRegisterInfo(RegisterKind reg_kind, } } - const RegisterInfo *array = - RegisterInfoPOSIX_riscv64::GetRegisterInfoPtr(m_arch); - const uint32_t length = - RegisterInfoPOSIX_riscv64::GetRegisterInfoCount(m_arch); + RegisterInfoPOSIX_riscv64 reg_info(m_arch, + RegisterInfoPOSIX_riscv64::eRegsetMaskAll); + const RegisterInfo *array = reg_info.GetRegisterInfo(); + const uint32_t length = reg_info.GetRegisterCount(); if (reg_index >= length || reg_kind != eRegisterKindLLDB) return {}; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp index bfa1a154b0f28..45b6c8ff9905b 100644 --- 
a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.cpp @@ -23,12 +23,11 @@ // System includes - They have to be included after framework includes because // they define some macros which collide with variable names in other modules +#include #include // NT_PRSTATUS and NT_FPREGSET definition #include -#define REG_CONTEXT_SIZE (GetGPRSize() + GetFPRSize()) - using namespace lldb; using namespace lldb_private; using namespace lldb_private::process_linux; @@ -38,7 +37,21 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux( const ArchSpec &target_arch, NativeThreadLinux &native_thread) { switch (target_arch.GetMachine()) { case llvm::Triple::riscv64: { - Flags opt_regsets; + Flags opt_regsets(RegisterInfoPOSIX_riscv64::eRegsetMaskDefault); + + RegisterInfoPOSIX_riscv64::FPR fpr; + struct iovec ioVec; + ioVec.iov_base = &fpr; + ioVec.iov_len = sizeof(fpr); + unsigned int regset = NT_FPREGSET; + + if (NativeProcessLinux::PtraceWrapper(PTRACE_GETREGSET, + native_thread.GetID(), ®set, + &ioVec, sizeof(fpr)) + .Success()) { + opt_regsets.Set(RegisterInfoPOSIX_riscv64::eRegsetMaskFP); + } + auto register_info_up = std::make_unique(target_arch, opt_regsets); return std::make_unique( @@ -194,20 +207,23 @@ Status NativeRegisterContextLinux_riscv64::ReadAllRegisterValues( lldb::WritableDataBufferSP &data_sp) { Status error; - data_sp.reset(new DataBufferHeap(REG_CONTEXT_SIZE, 0)); + data_sp.reset(new DataBufferHeap(GetRegContextSize(), 0)); error = ReadGPR(); if (error.Fail()) return error; - error = ReadFPR(); - if (error.Fail()) - return error; + if (GetRegisterInfo().IsFPPresent()) { + error = ReadFPR(); + if (error.Fail()) + return error; + } uint8_t *dst = const_cast(data_sp->GetBytes()); ::memcpy(dst, GetGPRBuffer(), GetGPRSize()); dst += GetGPRSize(); - ::memcpy(dst, GetFPRBuffer(), GetFPRSize()); + if (GetRegisterInfo().IsFPPresent()) + ::memcpy(dst, GetFPRBuffer(), GetFPRSize()); return error; } @@ -223,11 +239,11 @@ Status NativeRegisterContextLinux_riscv64::WriteAllRegisterValues( return error; } - if (data_sp->GetByteSize() != REG_CONTEXT_SIZE) { + if (data_sp->GetByteSize() != GetRegContextSize()) { error = Status::FromErrorStringWithFormat( "NativeRegisterContextLinux_riscv64::%s data_sp contained mismatched " "data size, expected %" PRIu64 ", actual %" PRIu64, - __FUNCTION__, REG_CONTEXT_SIZE, data_sp->GetByteSize()); + __FUNCTION__, GetRegContextSize(), data_sp->GetByteSize()); return error; } @@ -247,23 +263,32 @@ Status NativeRegisterContextLinux_riscv64::WriteAllRegisterValues( return error; src += GetRegisterInfoInterface().GetGPRSize(); - ::memcpy(GetFPRBuffer(), src, GetFPRSize()); - error = WriteFPR(); - if (error.Fail()) - return error; + if (GetRegisterInfo().IsFPPresent()) { + ::memcpy(GetFPRBuffer(), src, GetFPRSize()); + + error = WriteFPR(); + if (error.Fail()) + return error; + } return error; } +size_t NativeRegisterContextLinux_riscv64::GetRegContextSize() { + size_t size = GetGPRSize(); + if (GetRegisterInfo().IsFPPresent()) + size += GetFPRSize(); + return size; +} + bool NativeRegisterContextLinux_riscv64::IsGPR(unsigned reg) const { return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == RegisterInfoPOSIX_riscv64::GPRegSet; } bool NativeRegisterContextLinux_riscv64::IsFPR(unsigned reg) const { - return GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg) == - RegisterInfoPOSIX_riscv64::FPRegSet; + return GetRegisterInfo().IsFPReg(reg); } Status 
NativeRegisterContextLinux_riscv64::ReadGPR() { diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h index 41b4e2573add9..d5cc50131cdc3 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_riscv64.h @@ -75,6 +75,8 @@ class NativeRegisterContextLinux_riscv64 : public NativeRegisterContextLinux { RegisterInfoPOSIX_riscv64::FPR m_fpr; + size_t GetRegContextSize(); + bool IsGPR(unsigned reg) const; bool IsFPR(unsigned reg) const; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp index 035ce00e11626..bbcfb9eae1003 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_riscv64.cpp @@ -77,6 +77,5 @@ bool RegisterContextPOSIX_riscv64::IsGPR(unsigned int reg) { } bool RegisterContextPOSIX_riscv64::IsFPR(unsigned int reg) { - return m_register_info_up->GetRegisterSetFromRegisterIndex(reg) == - RegisterInfoPOSIX_riscv64::FPRegSet; + return m_register_info_up->IsFPReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp index 3819401c543b1..4a3737795848e 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.cpp @@ -18,42 +18,15 @@ #define GPR_OFFSET(idx) ((idx)*8 + 0) #define FPR_OFFSET(idx) ((idx)*8 + sizeof(RegisterInfoPOSIX_riscv64::GPR)) -#define REG_CONTEXT_SIZE \ - (sizeof(RegisterInfoPOSIX_riscv64::GPR) + \ - sizeof(RegisterInfoPOSIX_riscv64::FPR)) - #define DECLARE_REGISTER_INFOS_RISCV64_STRUCT #include "RegisterInfos_riscv64.h" #undef DECLARE_REGISTER_INFOS_RISCV64_STRUCT -const lldb_private::RegisterInfo *RegisterInfoPOSIX_riscv64::GetRegisterInfoPtr( - const lldb_private::ArchSpec &target_arch) { - switch (target_arch.GetMachine()) { - case llvm::Triple::riscv64: - return g_register_infos_riscv64_le; - default: - assert(false && "Unhandled target architecture."); - return nullptr; - } -} - -uint32_t RegisterInfoPOSIX_riscv64::GetRegisterInfoCount( - const lldb_private::ArchSpec &target_arch) { - switch (target_arch.GetMachine()) { - case llvm::Triple::riscv64: - return static_cast(sizeof(g_register_infos_riscv64_le) / - sizeof(g_register_infos_riscv64_le[0])); - default: - assert(false && "Unhandled target architecture."); - return 0; - } -} - // Number of register sets provided by this context. enum { k_num_gpr_registers = gpr_last_riscv - gpr_first_riscv + 1, k_num_fpr_registers = fpr_last_riscv - fpr_first_riscv + 1, - k_num_register_sets = 2 + k_num_register_sets_default = 1 }; // RISC-V64 general purpose registers. @@ -73,38 +46,69 @@ static_assert(((sizeof g_gpr_regnums_riscv64 / 1) == k_num_gpr_registers, "g_gpr_regnums_riscv64 has wrong number of register infos"); -// RISC-V64 floating point registers. 
-static const uint32_t g_fpr_regnums_riscv64[] = { - fpr_f0_riscv, fpr_f1_riscv, fpr_f2_riscv, fpr_f3_riscv, - fpr_f4_riscv, fpr_f5_riscv, fpr_f6_riscv, fpr_f7_riscv, - fpr_f8_riscv, fpr_f9_riscv, fpr_f10_riscv, fpr_f11_riscv, - fpr_f12_riscv, fpr_f13_riscv, fpr_f14_riscv, fpr_f15_riscv, - fpr_f16_riscv, fpr_f17_riscv, fpr_f18_riscv, fpr_f19_riscv, - fpr_f20_riscv, fpr_f21_riscv, fpr_f22_riscv, fpr_f23_riscv, - fpr_f24_riscv, fpr_f25_riscv, fpr_f26_riscv, fpr_f27_riscv, - fpr_f28_riscv, fpr_f29_riscv, fpr_f30_riscv, fpr_f31_riscv, - fpr_fcsr_riscv, LLDB_INVALID_REGNUM}; - -static_assert(((sizeof g_fpr_regnums_riscv64 / - sizeof g_fpr_regnums_riscv64[0]) - - 1) == k_num_fpr_registers, - "g_fpr_regnums_riscv64 has wrong number of register infos"); - // Register sets for RISC-V64. -static const lldb_private::RegisterSet g_reg_sets_riscv64[k_num_register_sets] = - {{"General Purpose Registers", "gpr", k_num_gpr_registers, - g_gpr_regnums_riscv64}, - {"Floating Point Registers", "fpr", k_num_fpr_registers, - g_fpr_regnums_riscv64}}; +static const lldb_private::RegisterSet g_reg_set_gpr_riscv64 = { + "General Purpose Registers", "gpr", k_num_gpr_registers, + g_gpr_regnums_riscv64}; +static const lldb_private::RegisterSet g_reg_set_fpr_riscv64 = { + "Floating Point Registers", "fpr", k_num_fpr_registers, nullptr}; RegisterInfoPOSIX_riscv64::RegisterInfoPOSIX_riscv64( - const lldb_private::ArchSpec &target_arch, lldb_private::Flags flags) + const lldb_private::ArchSpec &target_arch, lldb_private::Flags opt_regsets) : lldb_private::RegisterInfoAndSetInterface(target_arch), - m_register_info_p(GetRegisterInfoPtr(target_arch)), - m_register_info_count(GetRegisterInfoCount(target_arch)) {} + m_opt_regsets(opt_regsets) { + switch (target_arch.GetMachine()) { + case llvm::Triple::riscv64: { + // By-default considering RISC-V has only GPR. + // Other register sets could be enabled optionally by opt_regsets. + AddRegSetGP(); + + if (m_opt_regsets.AnySet(eRegsetMaskFP)) + AddRegSetFP(); + + break; + } + default: + assert(false && "Unhandled target architecture."); + } +} + +void RegisterInfoPOSIX_riscv64::AddRegSetGP() { + m_register_infos.resize(k_num_gpr_registers); + memcpy(&m_register_infos[0], g_register_infos_riscv64_gpr, + sizeof(g_register_infos_riscv64_gpr)); + m_register_sets.push_back(g_reg_set_gpr_riscv64); + + m_per_regset_regnum_range[GPRegSet] = + std::make_pair(gpr_first_riscv, m_register_infos.size()); +} + +void RegisterInfoPOSIX_riscv64::AddRegSetFP() { + const uint32_t register_info_count = m_register_infos.size(); + const uint32_t register_set_count = m_register_sets.size(); + + // Filling m_register_infos. + // For FPR case we do not need to correct register offsets and kinds + // while for other further cases (like VPR), register offset/kind + // should be started counting from the last one in previously added + // regset. This is needed for the case e.g. when architecture has GPR + VPR + // sets only. 
+ m_register_infos.resize(register_info_count + k_num_fpr_registers); + memcpy(&m_register_infos[register_info_count], g_register_infos_riscv64_fpr, + sizeof(g_register_infos_riscv64_fpr)); + + // Filling m_register_sets with enabled register set + for (uint32_t i = 0; i < k_num_fpr_registers; i++) + m_fp_regnum_collection.push_back(register_info_count + i); + m_register_sets.push_back(g_reg_set_fpr_riscv64); + m_register_sets.back().registers = m_fp_regnum_collection.data(); + + m_per_regset_regnum_range[register_set_count] = + std::make_pair(register_info_count, m_register_infos.size()); +} uint32_t RegisterInfoPOSIX_riscv64::GetRegisterCount() const { - return m_register_info_count; + return m_register_infos.size(); } size_t RegisterInfoPOSIX_riscv64::GetGPRSize() const { @@ -117,26 +121,30 @@ size_t RegisterInfoPOSIX_riscv64::GetFPRSize() const { const lldb_private::RegisterInfo * RegisterInfoPOSIX_riscv64::GetRegisterInfo() const { - return m_register_info_p; + return m_register_infos.data(); } size_t RegisterInfoPOSIX_riscv64::GetRegisterSetCount() const { - return k_num_register_sets; + return m_register_sets.size(); } size_t RegisterInfoPOSIX_riscv64::GetRegisterSetFromRegisterIndex( uint32_t reg_index) const { - // coverity[unsigned_compare] - if (reg_index >= gpr_first_riscv && reg_index <= gpr_last_riscv) - return GPRegSet; - if (reg_index >= fpr_first_riscv && reg_index <= fpr_last_riscv) - return FPRegSet; + for (const auto ®set_range : m_per_regset_regnum_range) { + if (reg_index >= regset_range.second.first && + reg_index < regset_range.second.second) + return regset_range.first; + } return LLDB_INVALID_REGNUM; } +bool RegisterInfoPOSIX_riscv64::IsFPReg(unsigned reg) const { + return llvm::is_contained(m_fp_regnum_collection, reg); +} + const lldb_private::RegisterSet * RegisterInfoPOSIX_riscv64::GetRegisterSet(size_t set_index) const { if (set_index < GetRegisterSetCount()) - return &g_reg_sets_riscv64[set_index]; + return &m_register_sets[set_index]; return nullptr; } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h index 4bf4bede01328..f8e22c7df3c88 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_riscv64.h @@ -11,19 +11,21 @@ #include "RegisterInfoAndSetInterface.h" #include "lldb/Target/RegisterContext.h" +#include "lldb/Utility/Flags.h" #include "lldb/lldb-private.h" #include class RegisterInfoPOSIX_riscv64 : public lldb_private::RegisterInfoAndSetInterface { public: - static const lldb_private::RegisterInfo * - GetRegisterInfoPtr(const lldb_private::ArchSpec &target_arch); - static uint32_t - GetRegisterInfoCount(const lldb_private::ArchSpec &target_arch); + enum { GPRegSet = 0 }; -public: - enum { GPRegSet = 0, FPRegSet }; + // RISC-V64 register set mask value + enum { + eRegsetMaskDefault = 0, + eRegsetMaskFP = 1, + eRegsetMaskAll = -1, + }; struct GPR { // note: gpr[0] is pc, not x0 @@ -41,7 +43,11 @@ class RegisterInfoPOSIX_riscv64 }; RegisterInfoPOSIX_riscv64(const lldb_private::ArchSpec &target_arch, - lldb_private::Flags flags); + lldb_private::Flags opt_regsets); + + void AddRegSetGP(); + + void AddRegSetFP(); size_t GetGPRSize() const override; @@ -58,9 +64,23 @@ class RegisterInfoPOSIX_riscv64 size_t GetRegisterSetFromRegisterIndex(uint32_t reg_index) const override; + bool IsFPPresent() const { return m_opt_regsets.AnySet(eRegsetMaskFP); } + + bool IsFPReg(unsigned reg) const; + 
private: - const lldb_private::RegisterInfo *m_register_info_p; - uint32_t m_register_info_count; + std::vector m_register_infos; + + std::vector m_register_sets; + + // Contains pair of [start, end] register numbers of a register set with start + // and end included. + std::map> m_per_regset_regnum_range; + + // Register collections to be stored as reference for m_register_sets items + std::vector m_fp_regnum_collection; + + lldb_private::Flags m_opt_regsets; }; #endif diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h b/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h index 720d900c7b97e..628ed3770bdc0 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfos_riscv64.h @@ -79,7 +79,7 @@ using namespace riscv_dwarf; // clang-format on -static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { +static lldb_private::RegisterInfo g_register_infos_riscv64_gpr[] = { // DEFINE_GPR64(name, GENERIC KIND) DEFINE_GPR64(pc, LLDB_REGNUM_GENERIC_PC), DEFINE_GPR64_ALT(ra, x1, LLDB_REGNUM_GENERIC_RA), @@ -114,7 +114,9 @@ static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { DEFINE_GPR64_ALT(t5, x30, LLDB_INVALID_REGNUM), DEFINE_GPR64_ALT(t6, x31, LLDB_INVALID_REGNUM), DEFINE_GPR64_ALT(zero, x0, LLDB_INVALID_REGNUM), +}; +static lldb_private::RegisterInfo g_register_infos_riscv64_fpr[] = { DEFINE_FPR64_ALT(ft0, f0, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft1, f1, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft2, f2, LLDB_INVALID_REGNUM), @@ -148,39 +150,25 @@ static lldb_private::RegisterInfo g_register_infos_riscv64_le[] = { DEFINE_FPR64_ALT(ft10, f30, LLDB_INVALID_REGNUM), DEFINE_FPR64_ALT(ft11, f31, LLDB_INVALID_REGNUM), DEFINE_FPR_ALT(fcsr, nullptr, 4, LLDB_INVALID_REGNUM), +}; - DEFINE_VPR(v0, LLDB_INVALID_REGNUM), - DEFINE_VPR(v1, LLDB_INVALID_REGNUM), - DEFINE_VPR(v2, LLDB_INVALID_REGNUM), - DEFINE_VPR(v3, LLDB_INVALID_REGNUM), - DEFINE_VPR(v4, LLDB_INVALID_REGNUM), - DEFINE_VPR(v5, LLDB_INVALID_REGNUM), - DEFINE_VPR(v6, LLDB_INVALID_REGNUM), - DEFINE_VPR(v7, LLDB_INVALID_REGNUM), - DEFINE_VPR(v8, LLDB_INVALID_REGNUM), - DEFINE_VPR(v9, LLDB_INVALID_REGNUM), - DEFINE_VPR(v10, LLDB_INVALID_REGNUM), - DEFINE_VPR(v11, LLDB_INVALID_REGNUM), - DEFINE_VPR(v12, LLDB_INVALID_REGNUM), - DEFINE_VPR(v13, LLDB_INVALID_REGNUM), - DEFINE_VPR(v14, LLDB_INVALID_REGNUM), - DEFINE_VPR(v15, LLDB_INVALID_REGNUM), - DEFINE_VPR(v16, LLDB_INVALID_REGNUM), - DEFINE_VPR(v17, LLDB_INVALID_REGNUM), - DEFINE_VPR(v18, LLDB_INVALID_REGNUM), - DEFINE_VPR(v19, LLDB_INVALID_REGNUM), - DEFINE_VPR(v20, LLDB_INVALID_REGNUM), - DEFINE_VPR(v21, LLDB_INVALID_REGNUM), - DEFINE_VPR(v22, LLDB_INVALID_REGNUM), - DEFINE_VPR(v23, LLDB_INVALID_REGNUM), - DEFINE_VPR(v24, LLDB_INVALID_REGNUM), - DEFINE_VPR(v25, LLDB_INVALID_REGNUM), - DEFINE_VPR(v26, LLDB_INVALID_REGNUM), - DEFINE_VPR(v27, LLDB_INVALID_REGNUM), - DEFINE_VPR(v28, LLDB_INVALID_REGNUM), - DEFINE_VPR(v29, LLDB_INVALID_REGNUM), - DEFINE_VPR(v30, LLDB_INVALID_REGNUM), - DEFINE_VPR(v31, LLDB_INVALID_REGNUM), +static lldb_private::RegisterInfo g_register_infos_riscv64_vpr[] = { + DEFINE_VPR(v0, LLDB_INVALID_REGNUM), DEFINE_VPR(v1, LLDB_INVALID_REGNUM), + DEFINE_VPR(v2, LLDB_INVALID_REGNUM), DEFINE_VPR(v3, LLDB_INVALID_REGNUM), + DEFINE_VPR(v4, LLDB_INVALID_REGNUM), DEFINE_VPR(v5, LLDB_INVALID_REGNUM), + DEFINE_VPR(v6, LLDB_INVALID_REGNUM), DEFINE_VPR(v7, LLDB_INVALID_REGNUM), + DEFINE_VPR(v8, LLDB_INVALID_REGNUM), DEFINE_VPR(v9, LLDB_INVALID_REGNUM), + DEFINE_VPR(v10, 
LLDB_INVALID_REGNUM), DEFINE_VPR(v11, LLDB_INVALID_REGNUM), + DEFINE_VPR(v12, LLDB_INVALID_REGNUM), DEFINE_VPR(v13, LLDB_INVALID_REGNUM), + DEFINE_VPR(v14, LLDB_INVALID_REGNUM), DEFINE_VPR(v15, LLDB_INVALID_REGNUM), + DEFINE_VPR(v16, LLDB_INVALID_REGNUM), DEFINE_VPR(v17, LLDB_INVALID_REGNUM), + DEFINE_VPR(v18, LLDB_INVALID_REGNUM), DEFINE_VPR(v19, LLDB_INVALID_REGNUM), + DEFINE_VPR(v20, LLDB_INVALID_REGNUM), DEFINE_VPR(v21, LLDB_INVALID_REGNUM), + DEFINE_VPR(v22, LLDB_INVALID_REGNUM), DEFINE_VPR(v23, LLDB_INVALID_REGNUM), + DEFINE_VPR(v24, LLDB_INVALID_REGNUM), DEFINE_VPR(v25, LLDB_INVALID_REGNUM), + DEFINE_VPR(v26, LLDB_INVALID_REGNUM), DEFINE_VPR(v27, LLDB_INVALID_REGNUM), + DEFINE_VPR(v28, LLDB_INVALID_REGNUM), DEFINE_VPR(v29, LLDB_INVALID_REGNUM), + DEFINE_VPR(v30, LLDB_INVALID_REGNUM), DEFINE_VPR(v31, LLDB_INVALID_REGNUM), }; #endif // DECLARE_REGISTER_INFOS_RISCV64_STRUCT diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp index 5ba18cdb9889a..3dca4b1609905 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.cpp @@ -16,9 +16,17 @@ std::unique_ptr RegisterContextCorePOSIX_riscv64::Create(Thread &thread, const ArchSpec &arch, const DataExtractor &gpregset, llvm::ArrayRef notes) { + Flags opt_regsets = RegisterInfoPOSIX_riscv64::eRegsetMaskDefault; + + DataExtractor fpregset = getRegset(notes, arch.GetTriple(), FPR_Desc); + if (fpregset.GetByteSize() >= sizeof(uint64_t)) { + opt_regsets.Set(RegisterInfoPOSIX_riscv64::eRegsetMaskFP); + } + return std::unique_ptr( new RegisterContextCorePOSIX_riscv64( - thread, std::make_unique(arch, Flags()), + thread, + std::make_unique(arch, opt_regsets), gpregset, notes)); } @@ -27,17 +35,14 @@ RegisterContextCorePOSIX_riscv64::RegisterContextCorePOSIX_riscv64( const DataExtractor &gpregset, llvm::ArrayRef notes) : RegisterContextPOSIX_riscv64(thread, std::move(register_info)) { - m_gpr_buffer = std::make_shared(gpregset.GetDataStart(), - gpregset.GetByteSize()); - m_gpr.SetData(m_gpr_buffer); + m_gpr.SetData(std::make_shared(gpregset.GetDataStart(), + gpregset.GetByteSize())); m_gpr.SetByteOrder(gpregset.GetByteOrder()); - ArchSpec arch = m_register_info_up->GetTargetArchitecture(); - DataExtractor fpregset = getRegset(notes, arch.GetTriple(), FPR_Desc); - m_fpr_buffer = std::make_shared(fpregset.GetDataStart(), - fpregset.GetByteSize()); - m_fpr.SetData(m_fpr_buffer); - m_fpr.SetByteOrder(fpregset.GetByteOrder()); + if (m_register_info_up->IsFPPresent()) { + ArchSpec arch = m_register_info_up->GetTargetArchitecture(); + m_fpr = getRegset(notes, arch.GetTriple(), FPR_Desc); + } } RegisterContextCorePOSIX_riscv64::~RegisterContextCorePOSIX_riscv64() = default; diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h index 3cf9531df2c1d..a9a5984463574 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_riscv64.h @@ -50,9 +50,6 @@ class RegisterContextCorePOSIX_riscv64 : public RegisterContextPOSIX_riscv64 { bool WriteFPR() override; private: - lldb::DataBufferSP m_gpr_buffer; - lldb::DataBufferSP m_fpr_buffer; - lldb_private::DataExtractor m_gpr; lldb_private::DataExtractor m_fpr; }; diff --git 
a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 0b9d17bc9f45e..7e8531c88bf34 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -21,7 +21,8 @@ class LinuxCoreTestCase(TestBase): _x86_64_pid = 32259 _s390x_pid = 1045 _ppc64le_pid = 28147 - _riscv64_pid = 89328 + _riscv64_gpr_fpr_pid = 1089 + _riscv64_gpr_only_pid = 97 _aarch64_regions = 4 _i386_regions = 4 @@ -61,9 +62,25 @@ def test_s390x(self): self.do_test("linux-s390x", self._s390x_pid, self._s390x_regions, "a.out") @skipIfLLVMTargetMissing("RISCV") - def test_riscv64(self): + def test_riscv64_gpr_fpr(self): """Test that lldb can read the process information from an riscv64 linux core file.""" - self.do_test("linux-riscv64", self._riscv64_pid, self._riscv64_regions, "a.out") + self.do_test( + "linux-riscv64.gpr_fpr", + self._riscv64_gpr_fpr_pid, + self._riscv64_regions, + "a.out", + ) + + @skipIfLLVMTargetMissing("RISCV") + def test_riscv64_gpr_only(self): + """Test that lldb can read the process information from an riscv64 linux core file + made for a RV64IMAC target, having no FP-registers.""" + self.do_test( + "linux-riscv64.gpr_only", + self._riscv64_gpr_only_pid, + self._riscv64_regions, + "a.out", + ) @skipIfLLVMTargetMissing("X86") def test_same_pid_running(self): @@ -668,46 +685,47 @@ def test_arm_core(self): self.expect("register read --all") @skipIfLLVMTargetMissing("RISCV") - def test_riscv64_regs(self): + def test_riscv64_regs_gpr_fpr(self): # check basic registers using 64 bit RISC-V core file target = self.dbg.CreateTarget(None) self.assertTrue(target, VALID_TARGET) - process = target.LoadCore("linux-riscv64.core") + process = target.LoadCore("linux-riscv64.gpr_fpr.core") values = {} - values["pc"] = "0x000000000001015e" - values["ra"] = "0x000000000001018c" - values["sp"] = "0x0000003fffd132a0" - values["gp"] = "0x0000002ae919af50" - values["tp"] = "0x0000003fdceae3e0" - values["t0"] = "0x0" - values["t1"] = "0x0000002ae9187b1c" - values["t2"] = "0x0000000000000021" - values["fp"] = "0x0000003fffd132d0" - values["s1"] = "0x0000002ae919cd98" + values["pc"] = "0x000000000001016e" + values["ra"] = "0x00000000000101a4" + values["sp"] = "0x0000003fffc1d2d0" + values["gp"] = "0x0000002ae6eccf50" + values["tp"] = "0x0000003ff3cb5400" + values["t0"] = "0x7f7f7f7fffffffff" + values["t1"] = "0x0000002ae6eb9b1c" + values["t2"] = "0xffffffffffffffff" + values["fp"] = "0x0000003fffc1d300" + values["s1"] = "0x0000002ae6eced98" values["a0"] = "0x0" values["a1"] = "0x0000000000010144" - values["a2"] = "0x0000002ae919cdb0" - values["a3"] = "0x000000000000002f" - values["a4"] = "0x000000000000002f" + values["a2"] = "0x0000002ae6ecedb0" + values["a3"] = "0xafdbdbff81cf7f81" + values["a4"] = "0x00000000000101e4" values["a5"] = "0x0" - values["a6"] = "0x7efefefefefefeff" + values["a6"] = "0x2f5b5a40014e0001" values["a7"] = "0x00000000000000dd" - values["s2"] = "0x0000002ae9196860" - values["s3"] = "0x0000002ae919cdb0" - values["s4"] = "0x0000003fffc63be8" - values["s5"] = "0x0000002ae919cb78" - values["s6"] = "0x0000002ae9196860" - values["s7"] = "0x0000002ae9196860" + values["s2"] = "0x0000002ae6ec8860" + values["s3"] = "0x0000002ae6ecedb0" + values["s4"] = "0x0000003fff886c18" + values["s5"] = "0x0000002ae6eceb78" + values["s6"] = "0x0000002ae6ec8860" + values["s7"] = "0x0000002ae6ec8860" values["s8"] = "0x0" values["s9"] = 
"0x000000000000000f" - values["s10"] = "0x0000002ae919a8d0" + values["s10"] = "0x0000002ae6ecc8d0" values["s11"] = "0x0000000000000008" - values["t3"] = "0x0000003fdce07df4" + values["t3"] = "0x0000003ff3be3728" values["t4"] = "0x0" - values["t5"] = "0x0000000000000020" - values["t6"] = "0x0000002ae919f1b0" + values["t5"] = "0x0000000000000002" + values["t6"] = "0x0000002ae6ed08b9" values["zero"] = "0x0" + values["fa5"] = "0xffffffff423c0000" values["fcsr"] = "0x00000000" fpr_names = { @@ -728,7 +746,7 @@ def test_riscv64_regs(self): "fa2", "fa3", "fa4", - "fa5", + # fa5 is non-zero and checked in the list above. "fa6", "fa7", "fs0", @@ -760,6 +778,61 @@ def test_riscv64_regs(self): self.expect("register read --all") + @skipIfLLVMTargetMissing("RISCV") + def test_riscv64_regs_gpr_only(self): + # check registers using 64 bit RISC-V core file containing GP-registers only + target = self.dbg.CreateTarget(None) + self.assertTrue(target, VALID_TARGET) + process = target.LoadCore("linux-riscv64.gpr_only.core") + + values = {} + values["pc"] = "0x0000000000010164" + values["ra"] = "0x0000000000010194" + values["sp"] = "0x00fffffff4d5fcc0" + values["gp"] = "0x0000000000157678" + values["tp"] = "0x00ffffff99c43400" + values["t0"] = "0x00ffffff99c6b260" + values["t1"] = "0x00ffffff99b7bd54" + values["t2"] = "0x0000000003f0b27f" + values["fp"] = "0x00fffffff4d5fcf0" + values["s1"] = "0x0000000000000003" + values["a0"] = "0x0" + values["a1"] = "0x0000000000010144" + values["a2"] = "0x0000000000176460" + values["a3"] = "0x000000000015ee38" + values["a4"] = "0x00000000423c0000" + values["a5"] = "0x0" + values["a6"] = "0x0" + values["a7"] = "0x00000000000000dd" + values["s2"] = "0x0" + values["s3"] = "0x000000000014ddf8" + values["s4"] = "0x000000000003651c" + values["s5"] = "0x00fffffffccd8d28" + values["s6"] = "0x000000000014ddf8" + values["s7"] = "0x00ffffff99c69d48" + values["s8"] = "0x00ffffff99c6a008" + values["s9"] = "0x0" + values["s10"] = "0x0" + values["s11"] = "0x0" + values["t3"] = "0x00ffffff99c42000" + values["t4"] = "0x00ffffff99af8e20" + values["t5"] = "0x0000000000000005" + values["t6"] = "0x44760bdd8d5f6381" + values["zero"] = "0x0" + + for regname, value in values.items(): + self.expect( + "register read {}".format(regname), + substrs=["{} = {}".format(regname, value)], + ) + + # Check that LLDB does not try to read other registers from core file + self.expect( + "register read --all", + matching=False, + substrs=["registers were unavailable"], + ) + def test_get_core_file_api(self): """ Test SBProcess::GetCoreFile() API can successfully get the core file. 
diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.core b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.core similarity index 76% rename from lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.core rename to lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_fpr.core index 0b159fcab931dca85bd5d09c282c2d6c217e2b83..7187b4000c56dc8ca74aec484310746a0fdd1e9c 100644 GIT binary patch delta 1412 zcmchV?@N# zpsaD<1VTuJy@*U0K|u_KA%%i7O$CbzMDQ0G_NFM;b9c`L?GNa{<#W&Xo^$Sf&U1o; zTyT(k(8~P?`&#+9VJ`f;IJ~aqzmQ$2YIH17h47dn3)geCff^-3lL~|w5!hfl+fi2M zal3@+_%0W@#;M=I(YPnT_v%ZErv)5I3>6S4(}6%4E;4~Rev&CrC08<0Sf zWV1uT5zzYrQ+EByrWJTTGuoXbsUiEHAe`lzheL^h&3So zmKM;!LGjUO@(Wm;+`Oc{)@&@}1%;5a!Xrc$Ybh{N$Y*loci8Ko!5F0PN?pu zRN6?NaYjlgQlMVa;09jvSf{Icm7gXz51}`!b))2)ND#5j-e*HOvcETP;Uc-FB`SFj zKGmXA&>-1Ytp3cwHN0qIG0{e8KZ#{A%GoW`PJc&^k>#+uB#s!Hx83i;+xo_IRyH@d z{my?oJK{5(HQ*6m)W&Qy7a?J03{TjKQYUQG9*Q&SGqLnez{uOD*R++dHqK zX1mMlad-CS2rg{{%IrmQfjzeQb90KApWqx=GCD`ou1Q%su z>L24cdC>zLFpb7bC8hdd-pn?M6O%v2iO)>ikt^#u5Jn*W=4%?y;=&>n9D)}Hc*Dp7 z<14ategeVYKnt0Qs|1D$_Kb@rZ-)ltJ@DqD6=@ya$KA=TXqAkqk?RbKrD?yEgY^i4*aIWLx%$V0~lYlp)v@DmvbBI%o*!Th6xOU&;M=PfA~GL&{X@oy^SAqk~H- z^5OapF52!Z5Q>A1JsphZNcx2gUx@4xL5g?r;e2#!fgVKiLw7qfqzN$^SWGnDZy~WP z=GDGG+SUH596ifnbx9mCIz$L1o(r$=9(6Zt&CU0iTpty&fX=T=tEXwwuXz;z-n&_a zLcE#w<`L$~H->$!FgJ5iI&f@<&(K0v;2Zc0!0ZxQW-D+v&;hIko&lFgi$P7rg(Ev%&s&@dF2ZiJ;Re(oo}tZk2UYFn6K&S(4tze-D9KkjJ)0oUwNg=UR7f=3MG1@ zVAUH<(qrX|%~p$mO%8WHgt{xm>O-Q<(9qHxG*As+cGPhKo^tF|K~O_mTPyzQD8vVx zCJj0)I(=Td*Xwdu`COvSY_#Ir&NG}4TVK(hqNXYT{!+Nt&1=Y}{OXgGnDH~Wu*ojE QYTO=Mdt)Y^o* z6l*2o2V_yNls7lehS&=(LhMzQhNTHlW+3W|5ftl+_pBL2aXfQ4<3f= zw_X_>y!6}UBNtEI{p*KsjGyN_3bV(0Z$szi-5=dO9|1UbrayM(lL+*0z5Va09jBKz zBGwVh+cJj5^V@CfIjAmOz$DGTm-oXT@=T)V?YZ!CWIH-leZ&gR3|1`YN5EWmEeKVkkhE5;?y9RhZO zu}v)!B$3sLd3bqwc{@tTwx+W4Qg&Swia)NCQrT8;=W{ojo<`0P=Zw5A&4-9pxRUkr z@@PMTyp+dF)_A(hOH#b?{rh(%9~v=qQ!6LagGxq84P*vw`1T<)Q%S3(OeLjiq!luijGCt+N7$U1tJXt*$9?O)5UZp1&?AF zZ*N7ZdoaQ`(7SGJb#Wb`Fffx`hmVQ)B2hYB_3Tl}pxTzN$Z+;K3>Fi?WKyDJh!Gd$&#-9L zH5(Xf5&d+R7IC)VhGilw26FwWSHOlqj_&GVkH>fR#13>wo!C|w)gdx+cy@t){` z4qT9SwxzWrE_ZM0ZfENwiH$f}3VX5(9pV@w-rq&=-p&(U4B1V65RX3*mvQ|g3d439 zaH0>eewQ2z1GUh+nmE^mqqu-M3{iN5M#R`Ztt}+zR=|u(H={L@Shv*XEjQNCgF#FZ z6Vefm$t>0k49f(tFv$&AlVee=k&RH-q{sr1ftCOt@YJ2$n*-Pt9rkk5Hr7)x*eCI=>=P_#4$bjvD% zT6Hwr(+wL+npsv06CPwzXuusj7jvqLvs#woXOsJy=M_gU6 zR{3dA)q0wd*BlL0ql_m(sm_)hO-Je2o}3a|*>VhJvOKNm(`GS0VCF$_j2Qs3t7g)5eA+~;+%_rQ(vG7+;MeD!5ZyAaZ<&7Fv7QG@VON2h#>Ti z5mn&g-nh#Bz4Id2-k)xzhCh~ozqnr(x&OSMjC$~f7~}rp0Ty)epWG?{E_qnD&jdY- znf|uiTodA-OK-P734eiUs01;x-Q*Jz>{pOW4F@su@1c(B9l|k+av;;9CC-_2t zTJ!XjAoneVgBUTt=Kc$wTdfRo-$4su!EqkuPXc~5>XPsm@*-Lr`CsAw_%EqT!p|?o v-t!Z$a{r^;h+h$C$1L`bh)XXEpK8B540Gb=b?EB3qL_{Rf93v{@8SO+vIc^W literal 0 HcmV?d00001 diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.gpr_only.core new file mode 100644 index 0000000000000000000000000000000000000000..63265a5db123d9ebc699a927d7477a264b558e91 GIT binary patch literal 28672 zcmeHQeQ*=U72lJvBuoK=3Ni&!5dwDjkR?OF0i5^)AxI3kHeb_*2>B#iDz@a2WP>xb z7}VJAOeheg*r84#Wd<{8n^4>t(@u#YA;7d@l1|$|r;tJ&aN0C{G{uG#P~W@TCyQ&C zGG*H4k6k#uk9}|V?Qh>6d(7V4MrX+aog^_TgV-l*Kw_+bDpF_x%VTL=9>nRYRHqc% zgJh7RLl(`IHZrK`Zf3NXF!)y?sx8Y zf4`Uhu&?zHEXG^1Xvy+bK!@-qkfidY_5kxVTtrFV_!x02-TmC1a_E6O{$zo;2SlqHzx6w>DREOyZj#+w z&JL-cPll*(feU*pURvf<^kvE~W5xo`sD!XjQd_7lnqpzg@u0?3lrPi|=j^S*1LS3_8=JH>;JYcDE82?PToF$7E(3~anhrr@vXBo*F#W}|xs`06L 
ztu`ip1p)#Afq+0jARrJB2nYlO0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&C z0fB%(Kp-Fx5C{ka1Ofs9fq+0jARrJB2nYl|w+L+1od4I~<@|ppKd_1?N7eKH1#r}M zIG(Al#WS-XY7e~f5m#zPPA5;1(wSS0i6T;aMCn|w4r;N#6Z}B0mfCp-MCsfv`ry^V z{j_u*)L|itwpdCpM7_vg=L#)-NU>s!C^u8in&TN^><-ToC(kEydm1~&M?N??g&J|%C z2$VH&q5ikLecnC8WBI5*q0uw9yWSfKxx-O+Bv2LhMeD*r(#h2!d9Az37mWsLe5B-J zsM70cAYIZh?2C{N?Bv=yk}9j~B7S#3_Q-x8s}2O~)=$T7ZSdEjzQ^I7X>&XRK|;~p z@kiB3Fmw3l;HJ>NfVKV;vU66AR`5>Tv#DOO~^FtxxPpoBHJS z4g0^|mcC+Y_Q<~+IEnc{xl)3$fy{JjX3=?a>Wl2?y_{>e6CpI;#^;yPU>~0k%FlOs zJbNA|>SMEK`Hq?m zhBXIT)3SOl_1)H4r1;OY05|}r@=1!M0L*)WB=7I)hnx7PPYu1;Zl2C z-M!sq=}jce#;OL{aE+;aM21fAK4T9WS9Z*R72ioep@cA@?JrbCh|&_#1` z>kwg7*i9Ly&d9Nqi;uJdvax;HE6x3xETCZ23UdQOEY~Euoy_^ZlgU)WptO7iofBu7lhcajToa+3@ zPW@xekU7neNsv8xn2T0j>t*nG?Q&tym-_x;wm{xA_ZCX=&i+%d_Ih<{?&twK%$_&e z{swEXib3{V?bH2h_Lwt`I$XI6+HSzCp6sURUrx>?+&#{B7}Pbs_KmGudT@=7f3wSy5BD77woPxrs$;3EJ=W1Q472JiAZ698d&c6XMg_gi zjmXP7OtN)Gmu;BaM$G4+T9-HWaOR+$Q7X)hss65 zs;bf#lzm{0)&4**TI;Ec_!P0Grmh-{nMq0?;wmnlW6dsGRPMCZJ7(B&?YT3i+a1## zk34L3%(U5Ub}OnJxsFGz*}nC)zA$W9gHca4ygg})_#@G9)KkH1Rlz!&-xKjOo3|kt zX{b?YG_2Ot`@)exD0qk9_A*;A6!j^Vo|=HnY(BrcGVH1Gx&2;nB#GIgzV%Ts^F%$& zrr=@oM5EzAMO_s3#5OrpgAoBh_gUb+t32P){*LycMDuw2wB4h96zyAazu5nYTP%FrtP1cu2`+f2 zIT=6gYl)8JmNb6qhiER;;~tr0+BXv={Iv1&W?3OHu*FQqf4ApdvM1b0b^(LW1QZ{w zBcgUbVDcmUXdhe(zGy=6-|e{>Oq4?Hko+_-ss8lbO4Lh+oKpX!_H%!Fu1QWXH+)~y zp+;mu?oZ=C$Dc=uYWpLZC@vxJt)+N~(*I{vl7dqXXHwe)9O~>zjmj+HFA~qu=*d5o zQtYXwE!_X#itUp>JT( zx2rz8|F??@xcx8qcYW@(d%dW%Mg3M)=2z7>DCxYdtobg_S}r$m`aGxYobKWDRZb6b zdYIE5PO1M1#E9=NDM)j4I;$^cyO5Ao!t)1?kuyj zO=(EP0`Vul5ECVe7>&lC#0L|7@I@t>$b%0?UwAe#G0_Bwydl@qFEIw9Nbo06?<$v#am6?79z(h@*a-n_qtFd9KqY{(s}cwL$cV2m zA`uNX=^8ZvdG1e84CWS!MK#9ZF+NT~2R#@FN?0n-{OaV}JoZ1eJe(VA4O-wlVd~(s zu&w*SL+d?!;GVqw!o9^e?xR^x+aDW$gD82Oa#7cc zuU(v4!zORudgm^KUruE5LG(&f`Yr>o2kQ@5zsiX5%L|8ron>ror{u0lA0iJeEiG*$ zlT24~;sxoYK&B%M&t1!0?~sz2E^u*Us6K)?#g9ADjGIOMal{RIxLDoaaTgaOe^dMR z?T!r{o}QUh=2F8-I+-4hCsXm%_N}qhh>}#2F%qTHsa>%leZHdGMoBL_nziMRsk7O!rEC7%K>JmnY&9xf>!I!qdID;yrA0S8qffSUb?eR)?Yw z<64I&_DEs_Qn$C=*h32j?UGoK-e5##?H$0dT?H&ilzOt; zBV81w!3Hg&8nj`b7H!cIgJ&(P1O?O59J&-@`%*g|kjb%lNy{30&C2KqvPM;lS1WpM zZ#tRUg?T?rYCAW(*G=w{mW#!UpjI8tc7Q_ZHk34@tQIP^l7qx7N)j4M7tAx-DQhl4 zM=L>RNJ*uY)UfKFn9-5AX_Ra83DcY_#VfXTQqMWq$!8OeUUd@Vho%#SS~=$!R#`I* z$Iz>ZidA(=mhI@Jgl-n%Im_1FRsHuy#vO_A^ln7TnBHP5n&mQFZ;21Ujh6CYqj{Z| zq4kX}L#U2vWMs^yRz(K>xcQVa;AXcdvU@u$SIE$@Oi1|}fZI!c1tFz|^$};+ySJCG z8(oBi_g^DBP#z`!ef&Y>fu>zKzJAeW`AL+oMnC@h>(~l-+W#oMf<8Z|SCF4uLH^uhlTffqbosH$2tt*mCM70W0)x;==JnThcu<7#GdYWkq6Vm~u8qfQ>2aQSSt z%FmOkHpBG1=4hbmWxUvx>Y0+GWsy3zM`s0Bwj5n4mTOA3W|;Z7kq5=m=N(XN%XOgW zvueS{LkUXGD$)CeDtfpS&2elaTf;L4N?y;_imGO7<>LLWQ7%|@RW@tubE1dm7yWrg zI1s^-7-wVSlK$_q7Cq2u5m%@<$KroA=!<}v&{XI-qYM)#QJnV)o_32(==*VmnEr~gh zA71JIEASdJ!l#fC{fRG#e_Vbn1@_CxUZM{xmYW`_w|?wd)c9otzRm4Fu3LJHANvZ5 z{W3zn$v5hXXI5|h*w;}*GXFeB`IYZaa$zs%yDQj##O)XAkv&G(HBym#jnE;U&_}rv se>Bj8RpbwTo7_S#2VW{K@yIz2^yi<5E5PKlm3;?)l0FkaPW|@(0Q3BfZ~y=R literal 0 HcmV?d00001 diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out b/lldb/test/API/functionalities/postmortem/elf-core/linux-riscv64.out deleted file mode 100755 index ae28aa9ba9aad9700b193281fa8045587c344187..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3328 zcmbtW-ESL35TCuv*-o4|P6KM93dK#+4-mQ7Zs-?nN!_GrB@{seQp-oLXZswx_G)g^p$EPvj88ld zjrV?9y7=O)UEfc9_r=Kz-&9|I^~+N;XZshSXUqN%{<;=TMQ%kG|9*Yn zpD~;L-k5p$(#+Y&V*i5A+p^~~~UZh7~9mLKnmB|gHL&BX=xiu$R*ObIVIN~ zgO|Q-SJJrz0Oe;tK4*~gxZsSwrp&bv>(tk|Ag_q_8uE7dywo!91 
zTKS4uOJ#QISv@^C>{{n$oJQtR0l=Z-7{RSH3g>_wMYScVx#xBbSF-htVQ1@U!z8Vg zt!IrQbp^s^D}{#ZDXn8$*r;2D1KD(DZzh}BNp`nT zKEThmZI+4!{}5yEpWxKs6yzPp=AGq&(eO;ygF+d1*fuLQy#PaH$F_!wyq<=vYH6_G zxE5yM-hJ~m$zn+O?RNS8W1BDB294Gv$ajdeO|ArAU-l7 zqYAhQ>ogVOYfw5dE%5tN*g!KXfjr3SY3IbX`I!*>a3RwTmZbPSG}B6>J2FXMiR5bh zD&iY(-tZ0LHQ`@sg1^xOf42#Kvk5NG7hNU;=O6U;rQ6Ohrm9ZftQvKvQu8c#0FlYj zL(d&Da%1DiCJY0|xyeamY+{t_`9?#Wcf(jsvx=r?f??G#nR;W!_RKs=&-G`n1KlbcCD*hq&!ENC`$5|^~xN;8)|MUulYx2KsYY3OsA>}yQ=^rvg zTmGMe_OUOP`!M%1 zTEZBS=fp<8zwGHNVeXp<6Bd5YZQ`narHQ|c^L7*edEw80j{-5IUr@@k6QIGT!hb{< oJJ6WKN1h-25#fz)e32@F_+mkPIzC#3uFX|f6aVXCqW{YOKOsPGG5`Po diff --git a/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c b/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c new file mode 100644 index 0000000000000..bcfe6d27359ca --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/main_fpr.c @@ -0,0 +1,14 @@ +static void bar(float *boom) { + float F = 98.0; + *boom = 47.0; // Frame bar +} + +static void foo(float *boom, void (*boomer)(float *)) { + float F = 102.0; + boomer(boom); // Frame foo +} + +void _start(void) { + float F = 95.0; + foo(0, bar); // Frame _start +} From 2d7339ad24b41eb06c417f7067b9fbeb4fdb2e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Wed, 4 Sep 2024 16:41:43 +0200 Subject: [PATCH 106/425] [AMDGPU][LDS] Fix dynamic LDS interaction with "amdgpu-no-lds-kernel-id" (#107092) Dynamic lds and Table lds both use the amdgpu_lds_kernel_id intrinsic. Kernels and functons that make an indirect use of this should not have the "amdgpu-no-lds-kernel-id" attribute. For the later, this was done. For the dynamic lds case, this was missing. This patch fixes it. --- .../Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 15 ++++++++------- .../AMDGPU/lower-module-lds-zero-size-arr.ll | 9 ++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 38f0b6dda1997..a166087a5cdc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1010,13 +1010,6 @@ class AMDGPULowerModuleLDS { M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement); replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered, LookupTable); - - // Strip amdgpu-no-lds-kernel-id from all functions reachable from the - // kernel. We may have inferred this wasn't used prior to the pass. - // - // TODO: We could filter out subgraphs that do not access LDS globals. - for (Function *F : KernelsThatAllocateTableLDS) - removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); } DenseMap KernelToCreatedDynamicLDS = @@ -1024,6 +1017,14 @@ class AMDGPULowerModuleLDS { KernelsThatIndirectlyAllocateDynamicLDS, DynamicVariables, OrderedKernels); + // Strip amdgpu-no-lds-kernel-id from all functions reachable from the + // kernel. We may have inferred this wasn't used prior to the pass. + // TODO: We could filter out subgraphs that do not access LDS globals. + for (auto *KernelSet : {&KernelsThatIndirectlyAllocateDynamicLDS, + &KernelsThatAllocateTableLDS}) + for (Function *F : *KernelSet) + removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); + // All kernel frames have been allocated. Calculate and record the // addresses. 
{ diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll index c7829be565373..59dfe3300c293 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll @@ -12,7 +12,7 @@ ;. define void @fn(float %val, i32 %idx) #0 { ; CHECK-LABEL: define void @fn( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VAR0]], align 4 @@ -28,7 +28,7 @@ define void @fn(float %val, i32 %idx) #0 { define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernelA( -; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds) ] ; CHECK-NEXT: tail call void @fn(float [[VAL]], i32 [[IDX]]) ; CHECK-NEXT: ret void @@ -40,9 +40,8 @@ define amdgpu_kernel void @kernelA(float %val, i32 %idx) #0 { attributes #0 = { "amdgpu-no-lds-kernel-id" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-lds-kernel-id" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1]] = !{i32 0} From 865edb0436bc55a3df3596eefb9a83050a5c7a96 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 4 Sep 2024 15:07:40 +0000 Subject: [PATCH 107/425] [mlir][ArmSME] Fix typo (NFC) --- mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td index d847dda5ae9f9..9a058ae4fe764 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td @@ -605,7 +605,7 @@ def InsertTileSliceOp : ArmSME_Op<"insert_tile_slice", [ ]> { let summary = "Insert 1-D scalable vector into slice of 2-D tile"; let description = [{ - Inserts a 1-D scalable vector to a slice of a 2-D scalable vector tile at + Inserts a 1-D scalable vector into a slice of a 2-D scalable vector tile at the given index. The type of the 1-D scalable vector to be inserted must match the type of the tile slice. A tile slice is a 1-D vector of horizontally or vertically contiguous elements within a ZA tile. The updated From ba40737e819b4ca77b25c0950c47c305a15a93de Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:17:05 -0400 Subject: [PATCH 108/425] [libc++][modules] Include __type_traits/invoke.h from __type_traits/result_of.h (#106796) The result_of trait requires the __invoke_of implementation detail, but that is defined under __type_traits, not under __functional. 
Include the right header directly to remove a dependency from __type_traits to __functional. --- libcxx/include/__type_traits/result_of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index f00fa8e9be7f7..73a1944752066 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_RESULT_OF_H #include <__config> -#include <__functional/invoke.h> +#include <__type_traits/invoke.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header From d9019d478d40b4e8766efccdb3eb1ff77cdbfaec Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:17:32 -0400 Subject: [PATCH 109/425] [libc++] Remove unused pair.h include from hypot.h (#106798) This was added in #100820 by mistake since the final version of that PR didn't depend on std::pair anymore. --- libcxx/include/__math/hypot.h | 1 - libcxx/test/libcxx/transitive_includes/cxx03.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx11.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx14.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx17.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx20.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx23.csv | 1 - libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 8 files changed, 8 deletions(-) diff --git a/libcxx/include/__math/hypot.h b/libcxx/include/__math/hypot.h index b992163711010..2c2c9c38ab530 100644 --- a/libcxx/include/__math/hypot.h +++ b/libcxx/include/__math/hypot.h @@ -18,7 +18,6 @@ #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_same.h> #include <__type_traits/promote.h> -#include <__utility/pair.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index fd2cd7f1c2d96..fd47c65ffc307 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -132,7 +132,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 04122fd0f4571..347d5d8796687 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -132,7 +132,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 42c742bead0c1..57bb2f25b3d62 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -133,7 +133,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index bc0659127d4bf..6826aeb75a83b 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -133,7 +133,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git 
a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index fed0944f0219c..17169e385d544 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -138,7 +138,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath type_traits diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 53f99384a7f57..267dca3cc6c41 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -85,7 +85,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 53f99384a7f57..267dca3cc6c41 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -85,7 +85,6 @@ chrono vector chrono version cinttypes cstdint cmath cstddef -cmath cstdint cmath initializer_list cmath limits cmath version From 7a785d46d6c31937c620f186464fdc59c265b4bf Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:18:00 -0400 Subject: [PATCH 110/425] [libc++][modules] Use inline variable instead of true_type (#106797) This allows breaking up a dependency from __fwd/array.h onto __type_traits, which is a circular dependency once __type_traits becomes a module of its own. This is also a small consistency improvement since we've been using inline variables for traits like this elsewhere in the library. --- libcxx/include/__fwd/array.h | 6 +++--- libcxx/include/span | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__fwd/array.h b/libcxx/include/__fwd/array.h index b429d0c5a9542..6c6461e727604 100644 --- a/libcxx/include/__fwd/array.h +++ b/libcxx/include/__fwd/array.h @@ -35,11 +35,11 @@ template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) _NOEXCEPT; #endif -template -struct __is_std_array : false_type {}; +template +inline const bool __is_std_array_v = false; template -struct __is_std_array > : true_type {}; +inline const bool __is_std_array_v > = true; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/span b/libcxx/include/span index da631cdc3f90e..a32f7a372e2ae 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -210,7 +210,7 @@ concept __span_compatible_range = ranges::contiguous_range<_Range> && // ranges::sized_range<_Range> && // (ranges::borrowed_range<_Range> || is_const_v<_ElementType>) && // - !__is_std_array>::value && // + !__is_std_array_v> && // !is_array_v> && // is_convertible_v> (*)[], _ElementType (*)[]>; From c1a8283fcc735b1567c49bb6cd485f9e71a12cc4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 11:18:30 -0400 Subject: [PATCH 111/425] [libc++][modules] Move __noexcept_move_assign_container out of __type_traits (#107140) That header depends on allocator traits, which is fundamentally tied to ``, not to ``. This breaks a cycle betweeen __type_traits and __memory. 
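For reference, the trait is essentially an allocator_traits query, which is why it belongs next to <memory> rather than <type_traits>. A minimal sketch of the idea (illustrative only; the name and exact form below are not the libc++ definition):

    #include <memory>
    #include <type_traits>

    // A container's move assignment can be noexcept when its allocator either
    // propagates on container move assignment or always compares equal; both
    // facts are queried through std::allocator_traits, i.e. <memory> machinery.
    template <class Alloc>
    struct noexcept_move_assign_container_sketch
        : std::integral_constant<
              bool,
              std::allocator_traits<Alloc>::propagate_on_container_move_assignment::value ||
                  std::allocator_traits<Alloc>::is_always_equal::value> {};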
--- libcxx/include/CMakeLists.txt | 2 +- .../noexcept_move_assign_container.h | 6 +++--- libcxx/include/module.modulemap | 2 +- libcxx/include/string | 2 +- libcxx/include/vector | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename libcxx/include/{__type_traits => __memory}/noexcept_move_assign_container.h (85%) diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 32579272858a8..210beaf5a3364 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -536,6 +536,7 @@ set(files __memory/construct_at.h __memory/destruct_n.h __memory/inout_ptr.h + __memory/noexcept_move_assign_container.h __memory/out_ptr.h __memory/pointer_traits.h __memory/ranges_construct_at.h @@ -824,7 +825,6 @@ set(files __type_traits/maybe_const.h __type_traits/nat.h __type_traits/negation.h - __type_traits/noexcept_move_assign_container.h __type_traits/promote.h __type_traits/rank.h __type_traits/remove_all_extents.h diff --git a/libcxx/include/__type_traits/noexcept_move_assign_container.h b/libcxx/include/__memory/noexcept_move_assign_container.h similarity index 85% rename from libcxx/include/__type_traits/noexcept_move_assign_container.h rename to libcxx/include/__memory/noexcept_move_assign_container.h index baaf36d9980e9..b0063516aaafc 100644 --- a/libcxx/include/__type_traits/noexcept_move_assign_container.h +++ b/libcxx/include/__memory/noexcept_move_assign_container.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H -#define _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#ifndef _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#define _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H #include <__config> #include <__memory/allocator_traits.h> @@ -34,4 +34,4 @@ struct __noexcept_move_assign_container _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H +#endif // _LIBCPP___MEMORY_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index f193b5d95f49f..297d155cb5594 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1531,6 +1531,7 @@ module std_private_memory_construct_at [system] { header "__m module std_private_memory_destruct_n [system] { header "__memory/destruct_n.h" } module std_private_memory_fwd [system] { header "__fwd/memory.h" } module std_private_memory_inout_ptr [system] { header "__memory/inout_ptr.h" } +module std_private_memory_noexcept_move_assign_container [system] { header "__memory/noexcept_move_assign_container.h" } module std_private_memory_out_ptr [system] { header "__memory/out_ptr.h" } module std_private_memory_pointer_traits [system] { header "__memory/pointer_traits.h" } module std_private_memory_ranges_construct_at [system] { header "__memory/ranges_construct_at.h" } @@ -2023,7 +2024,6 @@ module std_private_type_traits_make_unsigned [system module std_private_type_traits_maybe_const [system] { header "__type_traits/maybe_const.h" } module std_private_type_traits_nat [system] { header "__type_traits/nat.h" } module std_private_type_traits_negation [system] { header "__type_traits/negation.h" } -module std_private_type_traits_noexcept_move_assign_container [system] { header "__type_traits/noexcept_move_assign_container.h" } module std_private_type_traits_promote [system] { header "__type_traits/promote.h" } module std_private_type_traits_rank [system] { header 
"__type_traits/rank.h" } module std_private_type_traits_remove_all_extents [system] { header "__type_traits/remove_all_extents.h" } diff --git a/libcxx/include/string b/libcxx/include/string index 15c7a2f6b988b..5cb0693ad10bc 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -609,6 +609,7 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__memory/allocator_traits.h> #include <__memory/compressed_pair.h> #include <__memory/construct_at.h> +#include <__memory/noexcept_move_assign_container.h> #include <__memory/pointer_traits.h> #include <__memory/swap_allocator.h> #include <__memory_resource/polymorphic_allocator.h> @@ -629,7 +630,6 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__type_traits/is_standard_layout.h> #include <__type_traits/is_trivial.h> #include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/noexcept_move_assign_container.h> #include <__type_traits/remove_cvref.h> #include <__type_traits/void_t.h> #include <__utility/auto_cast.h> diff --git a/libcxx/include/vector b/libcxx/include/vector index 0f852e7f36c29..2442852c764a6 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -334,6 +334,7 @@ template requires is-vector-bool-reference // Since C++ #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> +#include <__memory/noexcept_move_assign_container.h> #include <__memory/pointer_traits.h> #include <__memory/swap_allocator.h> #include <__memory/temp_value.h> @@ -348,7 +349,6 @@ template requires is-vector-bool-reference // Since C++ #include <__type_traits/is_allocator.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/noexcept_move_assign_container.h> #include <__type_traits/type_identity.h> #include <__utility/exception_guard.h> #include <__utility/forward.h> From 3d9abfc9f841b13825e3d03cfba272f5eeab9a3b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 17:13:51 -0700 Subject: [PATCH 112/425] Consolidate all IR logic for getting the identity value of a reduction [nfc] This change merges the three different places (at the IR layer) for finding the identity value of a reduction into a single copy. This depends on several prior commits which fix ommissions and bugs in the distinct copies, but this patch itself should be fully non-functional. As the new comments and naming try to make clear, the identity value is a property of the @llvm.vector.reduce.* intrinsic, not of e.g. the recurrence descriptor. (We still provide an interface for clients using recurrence descriptors, but the implementation simply translates to the intrinsic which each corresponds to.) As a note, the getIntrinsicIdentity API does not support fminnum/fmaxnum or fminimum/fmaximum which is why we still need manual logic (but at least only one copy of manual logic) for those cases. 
--- llvm/include/llvm/Analysis/IVDescriptors.h | 3 - .../include/llvm/Transforms/Utils/LoopUtils.h | 8 +++ llvm/lib/Analysis/IVDescriptors.cpp | 46 --------------- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 51 ++-------------- llvm/lib/Transforms/Utils/LoopUtils.cpp | 59 +++++++++++++++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-- llvm/unittests/Analysis/IVDescriptorsTest.cpp | 8 --- 7 files changed, 70 insertions(+), 113 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index e7e6c5c01ad4d..5d992faf99d27 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -155,9 +155,6 @@ class RecurrenceDescriptor { /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); - /// Returns identity corresponding to the RecurrenceKind. - static Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); - /// Returns the opcode corresponding to the RecurrenceKind. static unsigned getOpcode(RecurKind Kind); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index ba8af4aa2b0cd..a761859465210 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -378,6 +378,14 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID); /// Returns the comparison predicate used when expanding a min/max reduction. CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK); +/// Given information about an @llvm.vector.reduce.* intrinsic, return +/// the identity value for the reduction. +Value *getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, FastMathFlags FMF); + +/// Given information about an recurrence kind, return the identity +/// for the @llvm.vector.reduce.* used to generate it. +Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index fdf78b9f8a44e..53001421ce6f7 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1031,52 +1031,6 @@ bool RecurrenceDescriptor::isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, return true; } -/// This function returns the identity element (or neutral element) for -/// the operation K. 
-Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF) { - switch (K) { - case RecurKind::Xor: - case RecurKind::Add: - case RecurKind::Or: - case RecurKind::Mul: - case RecurKind::And: - case RecurKind::FMul: - case RecurKind::FAdd: - return ConstantExpr::getBinOpIdentity(getOpcode(K), Tp, false, FMF.noSignedZeros()); - case RecurKind::FMulAdd: - return ConstantExpr::getBinOpIdentity(Instruction::FAdd, Tp, false, FMF.noSignedZeros()); - case RecurKind::UMin: - return ConstantInt::get(Tp, -1, true); - case RecurKind::UMax: - return ConstantInt::get(Tp, 0); - case RecurKind::SMin: - return ConstantInt::get(Tp, - APInt::getSignedMaxValue(Tp->getIntegerBitWidth())); - case RecurKind::SMax: - return ConstantInt::get(Tp, - APInt::getSignedMinValue(Tp->getIntegerBitWidth())); - case RecurKind::FMin: - case RecurKind::FMax: - assert((FMF.noNaNs() && FMF.noSignedZeros()) && - "nnan, nsz is expected to be set for FP min/max reduction."); - [[fallthrough]]; - case RecurKind::FMinimum: - case RecurKind::FMaximum: { - bool Negative = K == RecurKind::FMax || K == RecurKind::FMaximum; - const fltSemantics &Semantics = Tp->getFltSemantics(); - return !FMF.noInfs() - ? ConstantFP::getInfinity(Tp, Negative) - : ConstantFP::get(Tp, APFloat::getLargest(Semantics, Negative)); - } - case RecurKind::IAnyOf: - case RecurKind::FAnyOf: - llvm_unreachable("No meaningful identity for recurrence kind"); - default: - llvm_unreachable("Unknown recurrence kind"); - } -} - unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { switch (Kind) { case RecurKind::Add: diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index fe95a27f0a24f..ffe879ff04964 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -368,52 +368,11 @@ Value *CachingVPExpander::expandPredicationToFPCall( static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, Type *EltTy) { - bool Negative = false; - unsigned EltBits = EltTy->getScalarSizeInBits(); - Intrinsic::ID VID = VPI.getIntrinsicID(); - switch (VID) { - default: - llvm_unreachable("Expecting a VP reduction intrinsic"); - case Intrinsic::vp_reduce_add: - case Intrinsic::vp_reduce_or: - case Intrinsic::vp_reduce_xor: - case Intrinsic::vp_reduce_umax: - return Constant::getNullValue(EltTy); - case Intrinsic::vp_reduce_mul: - return ConstantInt::get(EltTy, 1, /*IsSigned*/ false); - case Intrinsic::vp_reduce_and: - case Intrinsic::vp_reduce_umin: - return ConstantInt::getAllOnesValue(EltTy); - case Intrinsic::vp_reduce_smin: - return ConstantInt::get(EltTy->getContext(), - APInt::getSignedMaxValue(EltBits)); - case Intrinsic::vp_reduce_smax: - return ConstantInt::get(EltTy->getContext(), - APInt::getSignedMinValue(EltBits)); - case Intrinsic::vp_reduce_fmax: - case Intrinsic::vp_reduce_fmaximum: - Negative = true; - [[fallthrough]]; - case Intrinsic::vp_reduce_fmin: - case Intrinsic::vp_reduce_fminimum: { - bool PropagatesNaN = VID == Intrinsic::vp_reduce_fminimum || - VID == Intrinsic::vp_reduce_fmaximum; - FastMathFlags Flags = VPI.getFastMathFlags(); - const fltSemantics &Semantics = EltTy->getFltSemantics(); - return (!Flags.noNaNs() && !PropagatesNaN) - ? ConstantFP::getQNaN(EltTy, Negative) - : !Flags.noInfs() - ? 
ConstantFP::getInfinity(EltTy, Negative) - : ConstantFP::get(EltTy, - APFloat::getLargest(Semantics, Negative)); - } - case Intrinsic::vp_reduce_fadd: - return ConstantExpr::getBinOpIdentity( - Instruction::FAdd, EltTy, false, - VPI.getFastMathFlags().noSignedZeros()); - case Intrinsic::vp_reduce_fmul: - return ConstantFP::get(EltTy, 1.0); - } + Intrinsic::ID RdxID = *VPI.getFunctionalIntrinsicID(); + FastMathFlags FMF; + if (isa(VPI)) + FMF = VPI.getFastMathFlags(); + return getReductionIdentity(RdxID, EltTy, FMF); } Value * diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 559129442a041..9a4289e1a30da 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1207,14 +1207,62 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } +Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, + FastMathFlags Flags) { + bool Negative = false; + switch (RdxID) { + default: + llvm_unreachable("Expecting a reduction intrinsic"); + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: { + unsigned Opc = getArithmeticReductionInstruction(RdxID); + return ConstantExpr::getBinOpIdentity(Opc, Ty, false, + Flags.noSignedZeros()); + } + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: { + Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RdxID); + return ConstantExpr::getIntrinsicIdentity(ScalarID, Ty); + } + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmaximum: + Negative = true; + [[fallthrough]]; + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_fminimum: { + bool PropagatesNaN = RdxID == Intrinsic::vector_reduce_fminimum || + RdxID == Intrinsic::vector_reduce_fmaximum; + const fltSemantics &Semantics = Ty->getFltSemantics(); + return (!Flags.noNaNs() && !PropagatesNaN) + ? ConstantFP::getQNaN(Ty, Negative) + : !Flags.noInfs() + ? 
ConstantFP::getInfinity(Ty, Negative) + : ConstantFP::get(Ty, APFloat::getLargest(Semantics, Negative)); + } + } +} + +Value *llvm::getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF) { + assert((!(K == RecurKind::FMin || K == RecurKind::FMax) || + (FMF.noNaNs() && FMF.noSignedZeros())) && + "nnan, nsz is expected to be set for FP min/max reduction."); + Intrinsic::ID RdxID = getReductionIntrinsicID(K); + return getReductionIdentity(RdxID, Tp, FMF); +} + Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); auto getIdentity = [&]() { - Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); - unsigned Opc = getArithmeticReductionInstruction(ID); - bool NSZ = Builder.getFastMathFlags().noSignedZeros(); - return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy, false, NSZ); + return getRecurrenceIdentity(RdxKind, SrcVecEltTy, + Builder.getFastMathFlags()); }; switch (RdxKind) { case RecurKind::Add: @@ -1249,8 +1297,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, Intrinsic::ID Id = getReductionIntrinsicID(Kind); auto *SrcTy = cast(Src->getType()); Type *SrcEltTy = SrcTy->getElementType(); - Value *Iden = - Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); + Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); Value *Ops[] = {Iden, Src}; return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 49ed733107da9..7708763aef859 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1840,8 +1840,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) Start = RdxDesc.getRecurrenceStartValue(); else - Start = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, - RdxDesc.getFastMathFlags()); + Start = llvm::getRecurrenceIdentity(Kind, ElementTy, + RdxDesc.getFastMathFlags()); if (State.VF.isVector()) Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); @@ -3010,8 +3010,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } } else { - Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), - RdxDesc.getFastMathFlags()); + Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), + RdxDesc.getFastMathFlags()); if (!ScalarPHI) { Iden = Builder.CreateVectorSplat(State.VF, Iden); diff --git a/llvm/unittests/Analysis/IVDescriptorsTest.cpp b/llvm/unittests/Analysis/IVDescriptorsTest.cpp index 32825d55ebd9a..ce9383d15f461 100644 --- a/llvm/unittests/Analysis/IVDescriptorsTest.cpp +++ b/llvm/unittests/Analysis/IVDescriptorsTest.cpp @@ -209,10 +209,6 @@ for.end: EXPECT_TRUE(IsRdxPhi); RecurKind Kind = Rdx.getRecurrenceKind(); EXPECT_EQ(Kind, RecurKind::FMin); - Type *Ty = Phi->getType(); - Value *Id = Rdx.getRecurrenceIdentity(Kind, Ty, Rdx.getFastMathFlags()); - // Identity value for FP min reduction is +Inf. - EXPECT_EQ(Id, ConstantFP::getInfinity(Ty, false /*Negative*/)); }); } @@ -261,9 +257,5 @@ for.end: EXPECT_TRUE(IsRdxPhi); RecurKind Kind = Rdx.getRecurrenceKind(); EXPECT_EQ(Kind, RecurKind::FMax); - Type *Ty = Phi->getType(); - Value *Id = Rdx.getRecurrenceIdentity(Kind, Ty, Rdx.getFastMathFlags()); - // Identity value for FP max reduction is -Inf. 
- EXPECT_EQ(Id, ConstantFP::getInfinity(Ty, true /*Negative*/)); }); } From 3ebd79751f2d5e1c54047409865c051daba0a21b Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Wed, 4 Sep 2024 11:29:10 -0400 Subject: [PATCH 113/425] [MLIR][ROCDL] Remove patterns for ops supported as intrinsics in the AMDGPU backend (#102971) This patch removes patterns for a few operations which allows mathToLLVM conversion to convert the operations into LLVM intrinsics instead since they are supported directly by the AMDGPU backend. --- .../Conversion/MathToROCDL/MathToROCDL.cpp | 14 ++- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 86 +++++-------------- .../Conversion/MathToROCDL/math-to-rocdl.mlir | 44 +--------- 3 files changed, 31 insertions(+), 113 deletions(-) diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp index 7de6971ba2ee7..b3b4d81e7ffa5 100644 --- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -48,18 +48,20 @@ static void populateOpPatterns(LLVMTypeConverter &converter, void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { // Handled by mathToLLVM: math::AbsIOp + // Handled by mathToLLVM: math::AbsFIOp // Handled by mathToLLVM: math::CopySignOp // Handled by mathToLLVM: math::CountLeadingZerosOp // Handled by mathToLLVM: math::CountTrailingZerosOp // Handled by mathToLLVM: math::CgPopOp + // Handled by mathToLLVM: math::ExpOp (32-bit only) // Handled by mathToLLVM: math::FmaOp + // Handled by mathToLLVM: math::LogOp (32-bit only) // FIXME: math::IPowIOp // FIXME: math::FPowIOp // Handled by mathToLLVM: math::RoundEvenOp // Handled by mathToLLVM: math::RoundOp + // Handled by mathToLLVM: math::SqrtOp // Handled by mathToLLVM: math::TruncOp - populateOpPatterns(converter, patterns, "__ocml_fabs_f32", - "__ocml_fabs_f64"); populateOpPatterns(converter, patterns, "__ocml_acos_f32", "__ocml_acos_f64"); populateOpPatterns(converter, patterns, "__ocml_acosh_f32", @@ -84,16 +86,14 @@ void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, "__ocml_cosh_f64"); populateOpPatterns(converter, patterns, "__ocml_sinh_f32", "__ocml_sinh_f64"); - populateOpPatterns(converter, patterns, "__ocml_exp_f32", - "__ocml_exp_f64"); + populateOpPatterns(converter, patterns, "", "__ocml_exp_f64"); populateOpPatterns(converter, patterns, "__ocml_exp2_f32", "__ocml_exp2_f64"); populateOpPatterns(converter, patterns, "__ocml_expm1_f32", "__ocml_expm1_f64"); populateOpPatterns(converter, patterns, "__ocml_floor_f32", "__ocml_floor_f64"); - populateOpPatterns(converter, patterns, "__ocml_log_f32", - "__ocml_log_f64"); + populateOpPatterns(converter, patterns, "", "__ocml_log_f64"); populateOpPatterns(converter, patterns, "__ocml_log10_f32", "__ocml_log10_f64"); populateOpPatterns(converter, patterns, "__ocml_log1p_f32", @@ -106,8 +106,6 @@ void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter, "__ocml_rsqrt_f64"); populateOpPatterns(converter, patterns, "__ocml_sin_f32", "__ocml_sin_f64"); - populateOpPatterns(converter, patterns, "__ocml_sqrt_f32", - "__ocml_sqrt_f64"); populateOpPatterns(converter, patterns, "__ocml_tanh_f32", "__ocml_tanh_f64"); populateOpPatterns(converter, patterns, "__ocml_tan_f32", diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index bf49a42a11577..b6fb08522ae1f 100644 --- 
a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -131,21 +131,6 @@ gpu.module @test_module { // ----- -gpu.module @test_module { - // CHECK: llvm.func @__ocml_fabs_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_fabs_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_fabs - func.func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.absf %arg_f32 : f32 - // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.absf %arg_f64 : f64 - // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - gpu.module @test_module { // CHECK: llvm.func @__ocml_cbrt_f32(f32) -> f32 // CHECK: llvm.func @__ocml_cbrt_f64(f64) -> f64 @@ -207,17 +192,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 // CHECK-LABEL: func @gpu_exp - func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %exp_f32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result32 = math.exp %exp_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + func.func @gpu_exp(%arg_f64 : f64) -> (f64) { %result64 = math.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -239,21 +219,20 @@ gpu.module @test_module { } // ----- - // Test that we handled properly operation with SymbolTable other than module op gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_exp - func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %exp_f32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result32 = math.exp %exp_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.exp %arg_f64 : f64 - // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 + // CHECK: llvm.func @__ocml_sin_f32(f32) -> f32 + // CHECK: llvm.func @__ocml_sin_f64(f64) -> f64 + // CHECK-LABEL: func @gpu_sin + func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %sin_f32 = math.sin %arg_f32 : f32 + // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 + %result32 = math.sin %sin_f32 : f32 + // CHECK: llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 + %result64 = math.sin %arg_f64 : f64 + // CHECK: llvm.call @__ocml_sin_f64(%{{.*}}) : (f64) -> f64 func.return %result32, %result64 : f32, f64 } "test.finish" () : () -> () @@ -280,15 +259,12 @@ gpu.module @test_module { // ----- gpu.module @test_module { - // CHECK: llvm.func @__ocml_log_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @gpu_log - func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.log %arg_f32 : f32 - // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (f32) -> f32 + func.func @gpu_log(%arg_f64 : f64) -> (f64) { %result64 = math.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -359,26 +335,6 @@ gpu.module @test_module { // ----- -gpu.module @test_module { - // CHECK: llvm.func @__ocml_sqrt_f32(f32) -> f32 - // CHECK: llvm.func 
@__ocml_sqrt_f64(f64) -> f64 - // CHECK-LABEL: func @gpu_sqrt - func.func @gpu_sqrt(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) - -> (f16, f32, f64) { - %result16 = math.sqrt %arg_f16 : f16 - // CHECK: llvm.fpext %{{.*}} : f16 to f32 - // CHECK-NEXT: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to f16 - %result32 = math.sqrt %arg_f32 : f32 - // CHECK: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.sqrt %arg_f64 : f64 - // CHECK: llvm.call @__ocml_sqrt_f64(%{{.*}}) : (f64) -> f64 - func.return %result16, %result32, %result64 : f16, f32, f64 - } -} - -// ----- - gpu.module @test_module { // CHECK: llvm.func @__ocml_tan_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tan_f64(f64) -> f64 @@ -472,15 +428,15 @@ gpu.module @test_module { gpu.module @test_module { // CHECK-LABEL: func @gpu_unroll func.func @gpu_unroll(%arg0 : vector<4xf32>) -> vector<4xf32> { - %result = math.exp %arg0 : vector<4xf32> + %result = math.sin %arg0 : vector<4xf32> // CHECK: %[[V0:.+]] = llvm.mlir.undef : vector<4xf32> - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V1:.+]] = llvm.insertelement %[[CL]], %[[V0]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V2:.+]] = llvm.insertelement %[[CL]], %[[V1]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V3:.+]] = llvm.insertelement %[[CL]], %[[V2]] - // CHECK: %[[CL:.+]] = llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + // CHECK: %[[CL:.+]] = llvm.call @__ocml_sin_f32(%{{.*}}) : (f32) -> f32 // CHECK: %[[V4:.+]] = llvm.insertelement %[[CL]], %[[V3]] // CHECK: return %[[V4]] func.return %result : vector<4xf32> @@ -526,9 +482,9 @@ gpu.module @test_module { gpu.module @module { // CHECK-LABEL: @spirv_exp -// CHECK: llvm.call @__ocml_exp_f32 +// CHECK: llvm.call @__ocml_sin_f32 spirv.func @spirv_exp(%arg0: vector<4xf32>) -> vector<4xf32> "None" { - %0 = math.exp %arg0 : vector<4xf32> + %0 = math.sin %arg0 : vector<4xf32> spirv.ReturnValue %0 : vector<4xf32> } } diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir index a406ec45a7f10..19d89e03a7f48 100644 --- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -15,21 +15,6 @@ module @test_module { // ----- -module @test_module { - // CHECK: llvm.func @__ocml_fabs_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_fabs_f64(f64) -> f64 - // CHECK-LABEL: func @math_absf - func.func @math_absf(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.absf %arg_f32 : f32 - // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.absf %arg_f64 : f64 - // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - module @test_module { // CHECK: llvm.func @__ocml_acos_f32(f32) -> f32 // CHECK: llvm.func @__ocml_acos_f64(f64) -> f64 @@ -211,15 +196,12 @@ module @test_module { // ----- module @test_module { - // CHECK: llvm.func @__ocml_exp_f32(f32) -> f32 // CHECK: llvm.func @__ocml_exp_f64(f64) -> f64 // CHECK-LABEL: func @math_exp - func.func @math_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - 
%result32 = math.exp %arg_f32 : f32 - // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (f32) -> f32 + func.func @math_exp(%arg_f64 : f64) -> (f64) { %result64 = math.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -271,15 +253,12 @@ module @test_module { // ----- module @test_module { - // CHECK: llvm.func @__ocml_log_f32(f32) -> f32 // CHECK: llvm.func @__ocml_log_f64(f64) -> f64 // CHECK-LABEL: func @math_log - func.func @math_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.log %arg_f32 : f32 - // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (f32) -> f32 + func.func @math_log(%arg_f64 : f64) -> (f64) { %result64 = math.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 + func.return %result64 : f64 } } @@ -360,21 +339,6 @@ module @test_module { // ----- -module @test_module { - // CHECK: llvm.func @__ocml_sqrt_f32(f32) -> f32 - // CHECK: llvm.func @__ocml_sqrt_f64(f64) -> f64 - // CHECK-LABEL: func @math_sqrt - func.func @math_sqrt(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { - %result32 = math.sqrt %arg_f32 : f32 - // CHECK: llvm.call @__ocml_sqrt_f32(%{{.*}}) : (f32) -> f32 - %result64 = math.sqrt %arg_f64 : f64 - // CHECK: llvm.call @__ocml_sqrt_f64(%{{.*}}) : (f64) -> f64 - func.return %result32, %result64 : f32, f64 - } -} - -// ----- - module @test_module { // CHECK: llvm.func @__ocml_tanh_f32(f32) -> f32 // CHECK: llvm.func @__ocml_tanh_f64(f64) -> f64 From fe454b2044aba1d808cec486a8ca7a0e202d31bf Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 4 Sep 2024 15:36:21 +0000 Subject: [PATCH 114/425] [gn build] Port c1a8283fcc73 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index f49c964b4128f..39ee220ee3a72 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -608,6 +608,7 @@ if (current_toolchain == default_toolchain) { "__memory/construct_at.h", "__memory/destruct_n.h", "__memory/inout_ptr.h", + "__memory/noexcept_move_assign_container.h", "__memory/out_ptr.h", "__memory/pointer_traits.h", "__memory/ranges_construct_at.h", @@ -895,7 +896,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/maybe_const.h", "__type_traits/nat.h", "__type_traits/negation.h", - "__type_traits/noexcept_move_assign_container.h", "__type_traits/promote.h", "__type_traits/rank.h", "__type_traits/remove_all_extents.h", From c81b43074ab010d01ad794224dd9dd22bbe8a1f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 08:43:13 -0700 Subject: [PATCH 115/425] [flang][cuda] Fix lowering of cuf kernel with unstructured nested construct (#107149) Lowering was crashing when cuf kernels has an unstructured construct. Blocks created by PFT need to be re-created inside of the operation like it is done for OpenACC construct. 
--- flang/lib/Lower/Bridge.cpp | 8 +++++++- .../Lower/CUDA/cuda-kernel-loop-directive.cuf | 20 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index e5ccf659c3f8e..1f2724290b885 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/Bridge.h" +#include "DirectivesCommon.h" #include "flang/Common/Version.h" #include "flang/Lower/Allocatable.h" #include "flang/Lower/CallInterface.h" @@ -2999,6 +3000,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Block &b = op.getRegion().back(); builder->setInsertionPointToStart(&b); + Fortran::lower::pft::Evaluation *crtEval = &getEval(); + if (crtEval->lowerAsUnstructured()) + Fortran::lower::createEmptyRegionBlocks( + *builder, crtEval->getNestedEvaluations()); + builder->setInsertionPointToStart(&b); + for (auto [arg, value] : llvm::zip( op.getLoopRegions().front()->front().getArguments(), ivValues)) { mlir::Value convArg = @@ -3006,7 +3013,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->create(loc, convArg, value); } - Fortran::lower::pft::Evaluation *crtEval = &getEval(); if (crtEval->lowerAsStructured()) { crtEval = &crtEval->getFirstNestedEvaluation(); for (int64_t i = 1; i < nestedLoops; i++) diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf index ba5d390df4785..aac569b6eb35b 100644 --- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf +++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf @@ -78,3 +78,23 @@ end ! CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]]#0 : !fir.ref ! CHECK: %[[STREAM_I32:.*]] = fir.convert %[[STREAM_LOAD]] : (i64) -> i32 ! CHECK: cuf.kernel<<<*, *, stream = %[[STREAM_I32]]>>> + + +! Test lowering with unstructured construct inside. +subroutine sub2(m,a,b) + integer :: m + real, device :: a(m,m), b(m) + integer :: i,j + !$cuf kernel do<<<*,*>>> + + do j = 1, m + i = 1 + do while (a(i,j).eq.0) + i = i + 1 + end do + b(j) = i + end do +end subroutine + +! CHECK-LABEL: func.func @_QPsub2 +! CHECK: cuf.kernel From 0367305af849da7ee9237fd83c04ed3a01e8d223 Mon Sep 17 00:00:00 2001 From: mzukovec <113346157+mzukovec@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:50:10 +0200 Subject: [PATCH 116/425] [lld][WebAssembly] Add allow-multiple-definition flag (#97699) Add `allow-multiple-definition` flag to `wasm-ld`. This follows the ELF linker logic. In case of duplication, the first symbol met is used. 
This PR resolves the #97543 --- .../wasm/Inputs/allow-multiple-definition.s | 6 +++ lld/test/wasm/allow-multiple-definition.s | 38 +++++++++++++++++++ lld/wasm/Config.h | 5 +++ lld/wasm/Driver.cpp | 24 +++++++++++- lld/wasm/Options.td | 7 ++++ lld/wasm/SymbolTable.cpp | 9 +++-- 6 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 lld/test/wasm/Inputs/allow-multiple-definition.s create mode 100644 lld/test/wasm/allow-multiple-definition.s diff --git a/lld/test/wasm/Inputs/allow-multiple-definition.s b/lld/test/wasm/Inputs/allow-multiple-definition.s new file mode 100644 index 0000000000000..7a5577cb12791 --- /dev/null +++ b/lld/test/wasm/Inputs/allow-multiple-definition.s @@ -0,0 +1,6 @@ + .hidden foo + .globl foo +foo: + .functype foo () -> (i32) + i32.const 1 + end_function diff --git a/lld/test/wasm/allow-multiple-definition.s b/lld/test/wasm/allow-multiple-definition.s new file mode 100644 index 0000000000000..93fccd3c46f90 --- /dev/null +++ b/lld/test/wasm/allow-multiple-definition.s @@ -0,0 +1,38 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t1 +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/allow-multiple-definition.s -o %t2 +# RUN: not wasm-ld %t1 %t2 -o /dev/null +# RUN: not wasm-ld --allow-multiple-definition --no-allow-multiple-definition %t1 %t2 -o /dev/null +# RUN: wasm-ld --allow-multiple-definition --fatal-warnings %t1 %t2 -o %t3 +# RUN: wasm-ld --allow-multiple-definition --fatal-warnings %t2 %t1 -o %t4 +# RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s +# RUN: llvm-objdump --no-print-imm-hex -d %t4 | FileCheck --check-prefix=REVERT %s + +# RUN: wasm-ld --noinhibit-exec %t2 %t1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# WARN: warning: duplicate symbol: foo + +# RUN: wasm-ld -z muldefs --fatal-warnings %t1 %t2 -o %t3 +# RUN: wasm-ld -z muldefs --fatal-warnings %t2 %t1 -o %t4 +# RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s +# RUN: llvm-objdump --no-print-imm-hex -d %t4 | FileCheck --check-prefix=REVERT %s + +# CHECK: i32.const 0 +# REVERT: i32.const 1 + +# inputs contain different constants for function foo return. +# Tests below checks that order of files in command line +# affects on what symbol will be used. +# If flag allow-multiple-definition is enabled the first +# meet symbol should be used. + + .hidden foo + .globl foo +foo: + .functype foo () -> (i32) + i32.const 0 + end_function + + .globl _start +_start: + .functype _start () -> (i32) + call foo + end_function diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 915c53c437172..05a547ff9278a 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Wasm.h" #include "llvm/Support/CachePruning.h" #include @@ -43,6 +44,7 @@ enum class BuildIdKind { None, Fast, Sha1, Hexstring, Uuid }; // and such fields have the same name as the corresponding options. // Most fields are initialized by the driver. 
struct Configuration { + bool allowMultipleDefinition; bool bsymbolic; bool checkFeatures; bool compressRelocations; @@ -64,6 +66,7 @@ struct Configuration { bool importUndefined; std::optional is64; bool mergeDataSegments; + bool noinhibitExec; bool pie; bool printGcSections; bool relocatable; @@ -148,6 +151,8 @@ struct Ctx { extern Ctx ctx; +void errorOrWarn(const llvm::Twine &msg); + } // namespace lld::wasm #endif diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 5368fe79b7eb8..cb8fe2534f5fe 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -47,6 +47,13 @@ namespace lld::wasm { Configuration *config; Ctx ctx; +void errorOrWarn(const llvm::Twine &msg) { + if (config->noinhibitExec) + warn(msg); + else + error(msg); +} + void Ctx::reset() { objectFiles.clear(); stubFiles.clear(); @@ -99,6 +106,16 @@ class LinkerDriver { std::vector files; }; + +static bool hasZOption(opt::InputArgList &args, StringRef key) { + bool ret = false; + for (const auto *arg : args.filtered(OPT_z)) + if (key == arg->getValue()) { + ret = true; + arg->claim(); + } + return ret; +} } // anonymous namespace bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, @@ -467,6 +484,10 @@ getBuildId(opt::InputArgList &args) { // Initializes Config members by the command line options. static void readConfigs(opt::InputArgList &args) { + config->allowMultipleDefinition = + hasZOption(args, "muldefs") || + args.hasFlag(OPT_allow_multiple_definition, + OPT_no_allow_multiple_definition, false); config->bsymbolic = args.hasArg(OPT_Bsymbolic); config->checkFeatures = args.hasFlag(OPT_check_features, OPT_no_check_features, true); @@ -479,6 +500,7 @@ static void readConfigs(opt::InputArgList &args) { config->exportAll = args.hasArg(OPT_export_all); config->exportTable = args.hasArg(OPT_export_table); config->growableTable = args.hasArg(OPT_growable_table); + config->noinhibitExec = args.hasArg(OPT_noinhibit_exec); if (args.hasArg(OPT_import_memory_with_name)) { config->memoryImport = @@ -1173,7 +1195,7 @@ static void splitSections() { static bool isKnownZFlag(StringRef s) { // For now, we only support a very limited set of -z flags - return s.starts_with("stack-size="); + return s.starts_with("stack-size=") || s.starts_with("muldefs"); } // Report a warning for an unknown -z option. 
diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 3a70ee65f7c4f..c5febd145a54f 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -42,6 +42,10 @@ def Bdynamic: F<"Bdynamic">, HelpText<"Link against shared libraries">; def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries (default)">; +defm allow_multiple_definition: B<"allow-multiple-definition", + "Allow multiple definitions", + "Do not allow multiple definitions (default)">; + def build_id: F<"build-id">, HelpText<"Alias for --build-id=fast">; def build_id_eq: J<"build-id=">, HelpText<"Generate build ID note">, @@ -105,6 +109,9 @@ defm mllvm: Eq<"mllvm", "Additional arguments to forward to LLVM's option proces defm Map: Eq<"Map", "Print a link map to the specified file">; +def noinhibit_exec: F<"noinhibit-exec">, + HelpText<"Retain the executable output file whenever it is still usable">; + def o: JoinedOrSeparate<["-"], "o">, MetaVarName<"">, HelpText<"Path to file to write output">; diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index a5d37a5eba6d5..d2216ff5a39a0 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -319,9 +319,12 @@ static bool shouldReplace(const Symbol *existing, InputFile *newFile, } // Neither symbol is week. They conflict. - error("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + - toString(existing->getFile()) + "\n>>> defined in " + - toString(newFile)); + if (config->allowMultipleDefinition) + return false; + + errorOrWarn("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + + toString(existing->getFile()) + "\n>>> defined in " + + toString(newFile)); return true; } From 697bc748f97736b294dd85b8f78530d023557b72 Mon Sep 17 00:00:00 2001 From: Renaud Kauffmann Date: Wed, 4 Sep 2024 08:59:55 -0700 Subject: [PATCH 117/425] Allow disabling of types from the command line (#107126) Adding hidden options to disable types through the `TargetCharacteristics`. I am seeing issues when I do this programmatically and would like, for anyone, to have the ability to reproduce them for development and testing purposes. I am planning to file a couple of issues following this patch. 
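For anyone trying to reproduce such issues, the flags are parsed by the frontend's target-option handling, so (assuming direct invocation of the frontend; the compiler driver may still need to forward them) an invocation along the lines of

    flang-new -fc1 -emit-hlfir -fdisable-real-10 -fdisable-integer-16 test.f90

exercises them. Note the options are marked HelpHidden, so they do not show up in the --help listing.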
--- clang/include/clang/Driver/Options.td | 8 ++++++++ flang/include/flang/Frontend/TargetOptions.h | 6 ++++++ flang/include/flang/Tools/TargetSetup.h | 8 ++++++++ flang/lib/Frontend/CompilerInvocation.cpp | 17 ++++++++++++++--- flang/tools/bbc/bbc.cpp | 5 +++-- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8fe9f4f28f8fc..1142416e227fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6762,6 +6762,14 @@ def fdefault_integer_8 : Flag<["-"],"fdefault-integer-8">, Group, HelpText<"Set the default integer and logical kind to an 8 byte wide type">; def fdefault_real_8 : Flag<["-"],"fdefault-real-8">, Group, HelpText<"Set the default real kind to an 8 byte wide type">; +def fdisable_real_3 : Flag<["-"],"fdisable-real-3">, Group, + HelpText<"Disable real(KIND=3) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_real_10 : Flag<["-"],"fdisable-real-10">, Group, + HelpText<"Disable real(KIND=10) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_integer_2 : Flag<["-"],"fdisable-integer-2">, Group, + HelpText<"Disable integer(KIND=2) from TargetCharacteristics">, Flags<[HelpHidden]>; +def fdisable_integer_16 : Flag<["-"],"fdisable-integer-16">, Group, + HelpText<"Disable integer(KIND=16) from TargetCharacteristics">, Flags<[HelpHidden]>; def flarge_sizes : Flag<["-"],"flarge-sizes">, Group, HelpText<"Use INTEGER(KIND=8) for the result type in size-related intrinsics">; diff --git a/flang/include/flang/Frontend/TargetOptions.h b/flang/include/flang/Frontend/TargetOptions.h index fa72c77a028a1..332adcbe6b6ac 100644 --- a/flang/include/flang/Frontend/TargetOptions.h +++ b/flang/include/flang/Frontend/TargetOptions.h @@ -38,6 +38,12 @@ class TargetOptions { /// The list of target specific features to enable or disable, as written on /// the command line. 
std::vector featuresAsWritten; + + /// The real KINDs disabled for this target + std::vector disabledRealKinds; + + /// The integer KINDs disabled for this target + std::vector disabledIntegerKinds; }; } // end namespace Fortran::frontend diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 238d66c9241dd..37c1e1d2ff63f 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -10,6 +10,7 @@ #define FORTRAN_TOOLS_TARGET_SETUP_H #include "flang/Evaluate/target.h" +#include "flang/Frontend/TargetOptions.h" #include "llvm/Target/TargetMachine.h" namespace Fortran::tools { @@ -17,6 +18,7 @@ namespace Fortran::tools { [[maybe_unused]] inline static void setUpTargetCharacteristics( Fortran::evaluate::TargetCharacteristics &targetCharacteristics, const llvm::TargetMachine &targetMachine, + const Fortran::frontend::TargetOptions &targetOptions, const std::string &compilerVersion, const std::string &compilerOptions) { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; @@ -25,6 +27,12 @@ namespace Fortran::tools { targetCharacteristics.DisableType( Fortran::common::TypeCategory::Real, /*kind=*/10); + for (auto realKind : targetOptions.disabledRealKinds) + targetCharacteristics.DisableType(common::TypeCategory::Real, realKind); + + for (auto intKind : targetOptions.disabledIntegerKinds) + targetCharacteristics.DisableType(common::TypeCategory::Integer, intKind); + targetCharacteristics.set_compilerOptionsString(compilerOptions) .set_compilerVersionString(compilerVersion); diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 9e42fcc2e39d5..90c327546198b 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -438,8 +438,19 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { for (const llvm::opt::Arg *currentArg : args.filtered(clang::driver::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); -} + if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + opts.disabledRealKinds.push_back(10); + + if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + opts.disabledRealKinds.push_back(3); + + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + opts.disabledIntegerKinds.push_back(2); + + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + opts.disabledIntegerKinds.push_back(16); +} // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { if (opts.programAction == DebugDumpParsingLog) @@ -1531,8 +1542,8 @@ CompilerInvocation::getSemanticsCtx( std::string compilerVersion = Fortran::common::getFlangFullVersion(); Fortran::tools::setUpTargetCharacteristics( - semanticsContext->targetCharacteristics(), targetMachine, compilerVersion, - allCompilerInvocOpts); + semanticsContext->targetCharacteristics(), targetMachine, getTargetOpts(), + compilerVersion, allCompilerInvocOpts); return semanticsContext; } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 736d68219581d..dcff4503f1657 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -18,6 +18,7 @@ #include "flang/Common/OpenMP-features.h" #include "flang/Common/Version.h" #include "flang/Common/default-kinds.h" +#include "flang/Frontend/TargetOptions.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/PFTBuilder.h" #include 
"flang/Lower/Support/Verifier.h" @@ -556,8 +557,8 @@ int main(int argc, char **argv) { std::string compilerVersion = Fortran::common::getFlangToolFullVersion("bbc"); std::string compilerOptions = ""; Fortran::tools::setUpTargetCharacteristics( - semanticsContext.targetCharacteristics(), *targetMachine, compilerVersion, - compilerOptions); + semanticsContext.targetCharacteristics(), *targetMachine, {}, + compilerVersion, compilerOptions); return mlir::failed( convertFortranSourceToMLIR(inputFilename, options, programPrefix, From 776495987272294de6aafbe73dab3e9ab445227a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 4 Sep 2024 12:03:44 -0400 Subject: [PATCH 118/425] [gn/mac] bump HOST_LINK_VERSION to 520 With this, clang will pass -platform_version instead of -mmacosx_version_min to the linker. Recent versions of the linker complain that the flag is now spelled mmacos_version_min (without the x), and this supresses that warning. 520 is over 4 years old by now, so just changing this unconditionally seems fine. --- llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn index a3fb952b1112b..bc0631dc269ac 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn @@ -68,7 +68,7 @@ write_cmake_config("Config") { # FIXME: Hardcoding this isn't great, but assuming that the host ld version # has anything to do with the ld version where the built clang will run # isn't either. Probably want to make this a declare_args. - values += [ "HOST_LINK_VERSION=305" ] + values += [ "HOST_LINK_VERSION=520" ] } else { values += [ "HOST_LINK_VERSION=" ] } From c537dd9375156c2aa3cd1bfaee88af7c492359d5 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Wed, 4 Sep 2024 09:31:59 -0700 Subject: [PATCH 119/425] [MS] Put dllexported inline global initializers in a comdat (#107154) Follow-up to c19f4f8069722f6804086d4438a0254104242c46 to handle corner case of exported inline variables. Should fix #56485 --- clang/lib/CodeGen/CGDeclCXX.cpp | 43 +++++++++++++------ .../microsoft-abi-template-static-init.cpp | 5 +-- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 2f56355cff90e..8dcb5f6100619 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -586,31 +586,50 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, PrioritizedCXXGlobalInits.size()); PrioritizedCXXGlobalInits.push_back(std::make_pair(Key, Fn)); } else if (isTemplateInstantiation(D->getTemplateSpecializationKind()) || - getContext().GetGVALinkageForVariable(D) == GVA_DiscardableODR || + !isUniqueGVALinkage(getContext().GetGVALinkageForVariable(D)) || D->hasAttr()) { + // For vague linkage globals, put the initializer into its own global_ctors + // entry with the global as a comdat key. This ensures at most one + // initializer per DSO runs during DSO dynamic initialization. + // + // For ELF platforms, this is an important code size and startup time + // optimization. For dynamic, non-hidden symbols, the weak guard variable + // remains to ensure that other DSOs do not re-initialize the global. 
+ // + // For PE-COFF platforms, there is no guard variable, and COMDAT + // associativity is the only way to ensure vague linkage globals are + // initialized exactly once. + // + // MachO is the only remaining platform with no comdats that doesn't + // benefit from this optimization. The rest are mainly modeled on ELF + // behavior. + // + // C++ requires that inline global variables are initialized in source + // order, but this requirement does not exist for templated entities. + // llvm.global_ctors does not guarantee initialization order, so in + // general, Clang does not fully conform to the ordering requirement. + // However, in practice, LLVM emits global_ctors in the provided order, and + // users typically don't rely on ordering between inline globals in + // different headers which are then transitively included in varying order. + // Clang's current behavior is a practical tradeoff, since dropping the + // comdat would lead to unacceptable impact on code size and startup time. + // + // FIXME: Find a solution to guarantee source-order initialization of + // inline variables. + // // C++ [basic.start.init]p2: // Definitions of explicitly specialized class template static data // members have ordered initialization. Other class template static data // members (i.e., implicitly or explicitly instantiated specializations) // have unordered initialization. // - // As a consequence, we can put them into their own llvm.global_ctors entry. - // - // If the global is externally visible, put the initializer into a COMDAT - // group with the global being initialized. On most platforms, this is a - // minor startup time optimization. In the MS C++ ABI, there are no guard - // variables, so this COMDAT key is required for correctness. - // - // SelectAny globals will be comdat-folded. Put the initializer into a - // COMDAT group associated with the global, so the initializers get folded - // too. - I = DelayedCXXInitPosition.find(D); // CXXGlobalInits.size() is the lex order number for the next deferred // VarDecl. Use it when the current VarDecl is non-deferred. Although this // lex order number is shared between current VarDecl and some following // VarDecls, their order of insertion into `llvm.global_ctors` is the same // as the lexing order and the following stable sort would preserve such // order. + I = DelayedCXXInitPosition.find(D); unsigned LexOrder = I == DelayedCXXInitPosition.end() ? CXXGlobalInits.size() : I->second; AddGlobalCtor(Fn, 65535, LexOrder, COMDATKey); diff --git a/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp b/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp index 60b48abca2f89..871551240debf 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-template-static-init.cpp @@ -49,8 +49,6 @@ struct X { static T ioo; static T init(); }; -// template specialized static data don't need in llvm.used, -// the static init routine get call from _GLOBAL__sub_I_ routines.
template <> int X::ioo = X::init(); template struct X; class a { @@ -87,5 +85,6 @@ struct S1 int foo(); inline int zoo = foo(); inline static int boo = foo(); +inline __declspec(dllexport) A exported_inline{}; -// CHECK: @llvm.used = appending global [8 x ptr] [ptr @"?x@selectany_init@@3HA", ptr @"?x1@selectany_init@@3HA", ptr @"?x@?$A@H@explicit_template_instantiation@@2HA", ptr @"?ioo@?$X_@H@@2HA", ptr @"?aoo@S1@@2UA@@A", ptr @"?zoo@@3HA", ptr @"?s@?$ExportedTemplate@H@@2US@@A", ptr @"?x@?$A@H@implicit_template_instantiation@@2HA"], section "llvm.metadata" +// CHECK: @llvm.used = appending global [10 x ptr] [ptr @"?x@selectany_init@@3HA", ptr @"?x1@selectany_init@@3HA", ptr @"?x@?$A@H@explicit_template_instantiation@@2HA", ptr @"?ioo@?$X_@H@@2HA", ptr @"?ioo@?$X@H@@2HA", ptr @"?aoo@S1@@2UA@@A", ptr @"?zoo@@3HA", ptr @"?exported_inline@@3UA@@A", ptr @"?s@?$ExportedTemplate@H@@2US@@A", ptr @"?x@?$A@H@implicit_template_instantiation@@2HA"], section "llvm.metadata" From 7e03753539baaaa7a5cc29da3c0dc4d2f6df3b58 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 4 Sep 2024 12:33:05 -0400 Subject: [PATCH 120/425] Disallow btf_type_tag in C++ mode (#107238) This was always intended to be disallowed in C++ (see the definition in Attr.td), but failed to add the correct checking code in SemaType.cpp to ensure it was rejected. Fixes #106864 --- clang/docs/ReleaseNotes.rst | 5 +++++ clang/lib/Sema/SemaType.cpp | 9 +++++++++ clang/test/Sema/attr-btf_type_tag.cpp | 11 +++++++++++ clang/test/SemaCXX/sugar-common-types.cpp | 7 ------- clang/test/SemaCXX/type-traits.cpp | 1 - 5 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 clang/test/Sema/attr-btf_type_tag.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bcdbc5b702765..1520f7a2916aa 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -225,6 +225,11 @@ Attribute Changes in Clang more cases where the returned reference outlives the object. (#GH100567) +- Clang now correctly diagnoses the use of ``btf_type_tag`` in C++ and ignores + it; this attribute is a C-only attribute, and caused crashes with template + instantiation by accidentally allowing it in C++ in some circumstances. + (#GH106864) + Improvements to Clang's diagnostics ----------------------------------- diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 7df8f663da26a..520dce870b7b7 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -6490,6 +6490,15 @@ static void HandleBTFTypeTagAttribute(QualType &Type, const ParsedAttr &Attr, TypeProcessingState &State) { Sema &S = State.getSema(); + // This attribute is only supported in C. + // FIXME: we should implement checkCommonAttributeFeatures() in SemaAttr.cpp + // such that it handles type attributes, and then call that from + // processTypeAttrs() instead of one-off checks like this. + if (!Attr.diagnoseLangOpts(S)) { + Attr.setInvalid(); + return; + } + // Check the number of attribute arguments. if (Attr.getNumArgs() != 1) { S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) diff --git a/clang/test/Sema/attr-btf_type_tag.cpp b/clang/test/Sema/attr-btf_type_tag.cpp new file mode 100644 index 0000000000000..cef78fff79b96 --- /dev/null +++ b/clang/test/Sema/attr-btf_type_tag.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=c -x c %s + +// c-no-diagnostics + +// Ensure that we diagnose the attribute as ignored in C++ but not in C. 
+#ifdef __cplusplus +static_assert(__builtin_is_implicit_lifetime(int __attribute__((btf_type_tag("user"))) *)); // expected-warning {{'btf_type_tag' attribute ignored}} +#endif +int __attribute__((btf_type_tag("user"))) *ptr; // expected-warning {{'btf_type_tag' attribute ignored}} + diff --git a/clang/test/SemaCXX/sugar-common-types.cpp b/clang/test/SemaCXX/sugar-common-types.cpp index e1c7578a66b9c..39a762127811f 100644 --- a/clang/test/SemaCXX/sugar-common-types.cpp +++ b/clang/test/SemaCXX/sugar-common-types.cpp @@ -90,13 +90,6 @@ N t19 = 0 ? (__underlying_type(EnumsX::X)){} : (__underlying_type(EnumsY::Y)){}; N t20 = 0 ? (__underlying_type(EnumsX::X)){} : (__underlying_type(EnumsY::X)){}; // expected-error@-1 {{rvalue of type '__underlying_type(Enums::X)' (aka 'int')}} -using SBTF1 = SS1 [[clang::btf_type_tag("1")]]; -using SBTF2 = ::SS1 [[clang::btf_type_tag("1")]]; -using SBTF3 = ::SS1 [[clang::btf_type_tag("2")]]; - -N t21 = 0 ? (SBTF1){} : (SBTF3){}; // expected-error {{from 'SS1'}} -N t22 = 0 ? (SBTF1){} : (SBTF2){}; // expected-error {{from 'SS1 __attribute__((btf_type_tag("1")))' (aka 'SS1')}} - using QX = const SB1 *; using QY = const ::SB1 *; N t23 = 0 ? (QX){} : (QY){}; // expected-error {{rvalue of type 'const SB1 *' (aka 'const SS1 *')}} diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index bf069d9bc082c..b8a9db103782c 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -2052,7 +2052,6 @@ void is_implicit_lifetime(int n) { static_assert(__builtin_is_implicit_lifetime(float4)); static_assert(__builtin_is_implicit_lifetime(align_value_int)); static_assert(__builtin_is_implicit_lifetime(int[[clang::annotate_type("category2")]] *)); - static_assert(__builtin_is_implicit_lifetime(int __attribute__((btf_type_tag("user"))) *)); static_assert(__builtin_is_implicit_lifetime(EnforceReadOnlyPlacement)); static_assert(__builtin_is_implicit_lifetime(int __attribute__((noderef)) *)); static_assert(__builtin_is_implicit_lifetime(TypeVisibility)); From accf90e16410468a2fa1ad9d1320f33fcc4cdd79 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Sep 2024 17:33:31 +0100 Subject: [PATCH 121/425] [X86] pull-conditional-binop-through-shift.ll - replace X32 check prefixes with X86 We tend to use X32 only for gnux32 testing --- .../pull-conditional-binop-through-shift.ll | 578 +++++++++--------- 1 file changed, 289 insertions(+), 289 deletions(-) diff --git a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll index 46e8664c5efda..4f39b8f945413 100644 --- a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 ; shift left @@ -15,18 +15,18 @@ define i32 @and_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB0_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: 
.LBB0_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB0_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -44,18 +44,18 @@ define i32 @and_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB1_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB1_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -74,18 +74,18 @@ define i32 @or_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB2_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB2_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB2_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB2_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -103,18 +103,18 @@ define i32 @or_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB3_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB3_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB3_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -133,18 +133,18 @@ define i32 
@xor_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB4_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB4_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB4_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB4_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -162,18 +162,18 @@ define i32 @xor_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB5_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $16711680, %eax # imm = 0xFF0000 -; X32-NEXT: .LBB5_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB5_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $16711680, %eax # imm = 0xFF0000 +; X86-NEXT: .LBB5_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -192,18 +192,18 @@ define i32 @add_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB6_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB6_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB6_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB6_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -221,18 +221,18 @@ define i32 @add_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_shl: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB7_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB7_2: -; X32-NEXT: shll $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_shl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB7_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB7_2: +; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = shl i32 %t1, 8 @@ -253,18 +253,18 @@ define i32 @and_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB8_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB8_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB8_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB8_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -282,18 +282,18 @@ define i32 @and_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB9_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB9_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB9_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB9_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -312,18 +312,18 @@ define i32 @or_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB10_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB10_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB10_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB10_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -341,18 +341,18 @@ define i32 @or_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB11_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB11_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB11_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB11_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -371,18 +371,18 @@ define i32 @xor_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB12_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB12_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB12_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB12_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -400,18 +400,18 @@ define i32 @xor_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB13_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB13_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB13_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB13_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -430,18 +430,18 @@ define i32 @add_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB14_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB14_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB14_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 
0xFFFF0000 +; X86-NEXT: .LBB14_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -459,18 +459,18 @@ define i32 @add_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_lshr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB15_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB15_2: -; X32-NEXT: shrl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_lshr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB15_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB15_2: +; X86-NEXT: shrl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = lshr i32 %t1, 8 @@ -491,18 +491,18 @@ define i32 @and_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB16_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB16_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB16_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB16_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -520,18 +520,18 @@ define i32 @and_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: and_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB17_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB17_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: and_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB17_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB17_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = and i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -550,18 +550,18 @@ define i32 @or_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB18_2 -; 
X32-NEXT: # %bb.1: -; X32-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB18_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB18_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB18_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -579,18 +579,18 @@ define i32 @or_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: or_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB19_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB19_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: or_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB19_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: orl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB19_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = or i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -609,18 +609,18 @@ define i32 @xor_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB20_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB20_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB20_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB20_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -638,18 +638,18 @@ define i32 @xor_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: xor_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB21_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB21_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: xor_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB21_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB21_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = xor i32 %x, 
2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -668,18 +668,18 @@ define i32 @add_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_signbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB22_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 -; X32-NEXT: .LBB22_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_signbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X86-NEXT: .LBB22_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 4294901760 ; 0xFFFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 @@ -697,18 +697,18 @@ define i32 @add_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq ; -; X32-LABEL: add_nosignbit_select_ashr: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: testb $1, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB23_2 -; X32-NEXT: # %bb.1: -; X32-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 -; X32-NEXT: .LBB23_2: -; X32-NEXT: sarl $8, %eax -; X32-NEXT: movl %eax, (%ecx) -; X32-NEXT: retl +; X86-LABEL: add_nosignbit_select_ashr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X86-NEXT: .LBB23_2: +; X86-NEXT: sarl $8, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl %t0 = add i32 %x, 2147418112 ; 0x7FFF0000 %t1 = select i1 %cond, i32 %t0, i32 %x %r = ashr i32 %t1, 8 From b2223b4d7efa4ed003a1b3ce7439106ddc63125f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 09:52:42 -0700 Subject: [PATCH 122/425] [WebAssembly] Rename legacy EH mir tests (#107189) We added `-legacy` suffix to the legacy EH `ll` tests in #107166 but forgot to do the same for `mir` tests. --- .../{cfg-stackify-eh.mir => cfg-stackify-eh-legacy.mir} | 0 .../CodeGen/WebAssembly/{exception.mir => exception-legacy.mir} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.mir => cfg-stackify-eh-legacy.mir} (100%) rename llvm/test/CodeGen/WebAssembly/{exception.mir => exception-legacy.mir} (100%) diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.mir b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir similarity index 100% rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.mir rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir diff --git a/llvm/test/CodeGen/WebAssembly/exception.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir similarity index 100% rename from llvm/test/CodeGen/WebAssembly/exception.mir rename to llvm/test/CodeGen/WebAssembly/exception-legacy.mir From 32bc670609fe9c938bca5b3c0e70e6b3934b4641 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 09:53:38 -0700 Subject: [PATCH 123/425] [WebAssembly] Misc. fixes in CFGStackify (NFC) (#107182) This contains misc. small fixes in CFGStackify. 
Most of them are comment fixes and variable name changes. Two code changes are removing the cases that can never occur. Another is extracting a routine as a lambda function. I will add explanations inline in the code as Github comments. --- .../WebAssembly/WebAssemblyCFGStackify.cpp | 141 +++++++++--------- 1 file changed, 67 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index c7001ef2b33e6..6fd882f62f3f0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -63,8 +63,9 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // over scoped regions when walking blocks. SmallVector ScopeTops; void updateScopeTops(MachineBasicBlock *Begin, MachineBasicBlock *End) { + int BeginNo = Begin->getNumber(); int EndNo = End->getNumber(); - if (!ScopeTops[EndNo] || ScopeTops[EndNo]->getNumber() > Begin->getNumber()) + if (!ScopeTops[EndNo] || ScopeTops[EndNo]->getNumber() > BeginNo) ScopeTops[EndNo] = Begin; } @@ -77,8 +78,8 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // Exception handling related functions bool fixCallUnwindMismatches(MachineFunction &MF); bool fixCatchUnwindMismatches(MachineFunction &MF); - void addTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd, - MachineBasicBlock *DelegateDest); + void addNestedTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd, + MachineBasicBlock *UnwindDest); void recalculateScopeTops(MachineFunction &MF); void removeUnnecessaryInstrs(MachineFunction &MF); @@ -225,7 +226,7 @@ void WebAssemblyCFGStackify::registerScope(MachineInstr *Begin, EndToBegin[End] = Begin; } -// When 'End' is not an 'end_try' but 'delegate, EHPad is nullptr. +// When 'End' is not an 'end_try' but a 'delegate', EHPad is nullptr. void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin, MachineInstr *End, MachineBasicBlock *EHPad) { @@ -293,7 +294,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { } } - // Decide where in Header to put the BLOCK. + // Decide where in MBB to put the BLOCK. // Instructions that should go before the BLOCK. SmallPtrSet BeforeSet; @@ -359,21 +360,20 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::BLOCK)) .addImm(int64_t(ReturnType)); - // Decide where in Header to put the END_BLOCK. + // Decide where in MBB to put the END_BLOCK. BeforeSet.clear(); AfterSet.clear(); for (auto &MI : MBB) { #ifndef NDEBUG - // END_BLOCK should precede existing LOOP and TRY markers. - if (MI.getOpcode() == WebAssembly::LOOP || - MI.getOpcode() == WebAssembly::TRY) + // END_BLOCK should precede existing LOOP markers. + if (MI.getOpcode() == WebAssembly::LOOP) AfterSet.insert(&MI); #endif // If there is a previously placed END_LOOP marker and the header of the // loop is above this block's header, the END_LOOP should be placed after - // the BLOCK, because the loop contains this block. Otherwise the END_LOOP - // should be placed before the BLOCK. The same for END_TRY. + // the END_BLOCK, because the loop contains this block. Otherwise the + // END_LOOP should be placed before the END_BLOCK. The same for END_TRY. 
if (MI.getOpcode() == WebAssembly::END_LOOP || MI.getOpcode() == WebAssembly::END_TRY) { if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber()) @@ -437,7 +437,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::LOOP)) .addImm(int64_t(WebAssembly::BlockType::Void)); - // Decide where in Header to put the END_LOOP. + // Decide where in MBB to put the END_LOOP. BeforeSet.clear(); AfterSet.clear(); #ifndef NDEBUG @@ -491,7 +491,6 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { WebAssemblyException *WE = WEI.getExceptionFor(&MBB); assert(WE); MachineBasicBlock *Bottom = SRI.getBottom(WE); - auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { getAppendixBlock(MF); @@ -499,12 +498,9 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { } MachineBasicBlock *Cont = &*Iter; - assert(Cont != &MF.front()); - MachineBasicBlock *LayoutPred = Cont->getPrevNode(); - // If the nearest common dominator is inside a more deeply nested context, // walk out to the nearest scope which isn't more deeply nested. - for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) { + for (MachineFunction::iterator I(Bottom), E(Header); I != E; --I) { if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { if (ScopeTop->getNumber() > Header->getNumber()) { // Skip over an intervening scope. @@ -538,7 +534,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { } // All previously inserted BLOCK/TRY markers should be after the TRY because - // they are all nested trys. + // they are all nested blocks/trys. if (MI.getOpcode() == WebAssembly::BLOCK || MI.getOpcode() == WebAssembly::TRY) AfterSet.insert(&MI); @@ -607,14 +603,13 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { TII.get(WebAssembly::TRY)) .addImm(int64_t(WebAssembly::BlockType::Void)); - // Decide where in Header to put the END_TRY. + // Decide where in Cont to put the END_TRY. BeforeSet.clear(); AfterSet.clear(); for (const auto &MI : *Cont) { #ifndef NDEBUG - // END_TRY should precede existing LOOP and BLOCK markers. - if (MI.getOpcode() == WebAssembly::LOOP || - MI.getOpcode() == WebAssembly::BLOCK) + // END_TRY should precede existing LOOP markers. + if (MI.getOpcode() == WebAssembly::LOOP) AfterSet.insert(&MI); // All END_TRY markers placed earlier belong to exceptions that contains @@ -643,9 +638,8 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // Mark the end of the TRY. InsertPos = getEarliestInsertPos(Cont, BeforeSet, AfterSet); - MachineInstr *End = - BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(), - TII.get(WebAssembly::END_TRY)); + MachineInstr *End = BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(), + TII.get(WebAssembly::END_TRY)); registerTryScope(Begin, End, &MBB); // Track the farthest-spanning scope that ends at this point. We create two @@ -845,9 +839,9 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, // Wrap the given range of instruction with try-delegate. RangeBegin and // RangeEnd are inclusive. 
-void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, - MachineInstr *RangeEnd, - MachineBasicBlock *DelegateDest) { +void WebAssemblyCFGStackify::addNestedTryDelegate( + MachineInstr *RangeBegin, MachineInstr *RangeEnd, + MachineBasicBlock *UnwindDest) { auto *BeginBB = RangeBegin->getParent(); auto *EndBB = RangeEnd->getParent(); MachineFunction &MF = *BeginBB->getParent(); @@ -879,8 +873,8 @@ void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, MachineBasicBlock *DelegateBB = MF.CreateMachineBasicBlock(); // If the destination of 'delegate' is not the caller, adds the destination to // the BB's successors. - if (DelegateDest != FakeCallerBB) - DelegateBB->addSuccessor(DelegateDest); + if (UnwindDest != FakeCallerBB) + DelegateBB->addSuccessor(UnwindDest); auto SplitPos = std::next(RangeEnd->getIterator()); if (SplitPos == EndBB->end()) { @@ -962,7 +956,7 @@ void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin, // Add 'delegate' instruction in the delegate BB created above. MachineInstr *Delegate = BuildMI(DelegateBB, RangeEnd->getDebugLoc(), TII.get(WebAssembly::DELEGATE)) - .addMBB(DelegateDest); + .addMBB(UnwindDest); registerTryScope(Try, Delegate, nullptr); } @@ -1130,7 +1124,7 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { if (EHPadStack.back() == UnwindDest) continue; - // Include EH_LABELs in the range before and afer the invoke + // Include EH_LABELs in the range before and after the invoke MachineInstr *RangeBegin = &MI, *RangeEnd = &MI; if (RangeBegin->getIterator() != MBB.begin() && std::prev(RangeBegin->getIterator())->isEHLabel()) @@ -1231,22 +1225,24 @@ bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) { std::tie(RangeBegin, RangeEnd) = Range; auto *MBB = RangeBegin->getParent(); - // If this BB has an EH pad successor, i.e., ends with an 'invoke', now we - // are going to wrap the invoke with try-delegate, making the 'delegate' - // BB the new successor instead, so remove the EH pad succesor here. The - // BB may not have an EH pad successor if calls in this BB throw to the - // caller. - MachineBasicBlock *EHPad = nullptr; - for (auto *Succ : MBB->successors()) { - if (Succ->isEHPad()) { - EHPad = Succ; - break; + // If this BB has an EH pad successor, i.e., ends with an 'invoke', and if + // the current range contains the invoke, now we are going to wrap the + // invoke with try-delegate, making the 'delegate' BB the new successor + // instead, so remove the EH pad succesor here. The BB may not have an EH + // pad successor if calls in this BB throw to the caller. 
+ if (UnwindDest != getFakeCallerBlock(MF)) { + MachineBasicBlock *EHPad = nullptr; + for (auto *Succ : MBB->successors()) { + if (Succ->isEHPad()) { + EHPad = Succ; + break; + } } + if (EHPad) + MBB->removeSuccessor(EHPad); } - if (EHPad) - MBB->removeSuccessor(EHPad); - addTryDelegate(RangeBegin, RangeEnd, UnwindDest); + addNestedTryDelegate(RangeBegin, RangeEnd, UnwindDest); } } @@ -1354,12 +1350,10 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { NumCatchUnwindMismatches += EHPadToUnwindDest.size(); SmallPtrSet NewEndTryBBs; - for (auto &P : EHPadToUnwindDest) { - MachineBasicBlock *EHPad = P.first; - MachineBasicBlock *UnwindDest = P.second; + for (auto &[EHPad, UnwindDest] : EHPadToUnwindDest) { MachineInstr *Try = EHPadToTry[EHPad]; MachineInstr *EndTry = BeginToEnd[Try]; - addTryDelegate(Try, EndTry, UnwindDest); + addNestedTryDelegate(Try, EndTry, UnwindDest); NewEndTryBBs.insert(EndTry->getParent()); } @@ -1534,7 +1528,7 @@ static void appendEndToFunction(MachineFunction &MF, TII.get(WebAssembly::END_FUNCTION)); } -/// Insert LOOP/TRY/BLOCK markers at appropriate places. +/// Insert BLOCK/LOOP/TRY markers at appropriate places. void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // We allocate one more than the number of blocks in the function to // accommodate for the possible fake block we may insert at the end. @@ -1558,9 +1552,9 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { // Fix mismatches in unwind destinations induced by linearizing the code. if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && MF.getFunction().hasPersonalityFn()) { - bool Changed = fixCallUnwindMismatches(MF); - Changed |= fixCatchUnwindMismatches(MF); - if (Changed) + bool MismatchFixed = fixCallUnwindMismatches(MF); + MismatchFixed |= fixCatchUnwindMismatches(MF); + if (MismatchFixed) recalculateScopeTops(MF); } } @@ -1654,6 +1648,23 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { // Now rewrite references to basic blocks to be depth immediates. SmallVector Stack; SmallVector EHPadStack; + + auto RewriteOperands = [&](MachineInstr &MI) { + // Rewrite MBB operands to be depth immediates. + SmallVector Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.removeOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) { + if (MI.getOpcode() == WebAssembly::DELEGATE) + MO = MachineOperand::CreateImm(getDelegateDepth(Stack, MO.getMBB())); + else + MO = MachineOperand::CreateImm(getBranchDepth(Stack, MO.getMBB())); + } + MI.addOperand(MF, MO); + } + }; + for (auto &MBB : reverse(MF)) { for (MachineInstr &MI : llvm::reverse(MBB)) { switch (MI.getOpcode()) { @@ -1697,23 +1708,8 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { break; default: - if (MI.isTerminator()) { - // Rewrite MBB operands to be depth immediates. 
- SmallVector Ops(MI.operands()); - while (MI.getNumOperands() > 0) - MI.removeOperand(MI.getNumOperands() - 1); - for (auto MO : Ops) { - if (MO.isMBB()) { - if (MI.getOpcode() == WebAssembly::DELEGATE) - MO = MachineOperand::CreateImm( - getDelegateDepth(Stack, MO.getMBB())); - else - MO = MachineOperand::CreateImm( - getBranchDepth(Stack, MO.getMBB())); - } - MI.addOperand(MF, MO); - } - } + if (MI.isTerminator()) + RewriteOperands(MI); if (MI.getOpcode() == WebAssembly::DELEGATE) Stack.push_back(std::make_pair(&MBB, &MI)); @@ -1767,10 +1763,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Add an end instruction at the end of the function body. const auto &TII = *MF.getSubtarget().getInstrInfo(); - if (!MF.getSubtarget() - .getTargetTriple() - .isOSBinFormatELF()) - appendEndToFunction(MF, TII); + appendEndToFunction(MF, TII); cleanupFunctionData(MF); From 26ba186bd0a22fac7d08ed566b00c03236b6b7a9 Mon Sep 17 00:00:00 2001 From: RolandF77 <55763885+RolandF77@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:55:27 -0400 Subject: [PATCH 124/425] [PowerPC] Improve pwr7 codegen for v4i8 load (#104507) There are no partial vector loads on pwr7 so current v4i8 codegen is an int load then store to vector sized temp and re-load as vector. Try to use lfiwax to load 32 bits into an FP reg and take advantage of VSX FP and vector reg sharing to move the result to the right vector position. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 24 +++- .../build-vector-from-load-and-zeros.ll | 119 ++++++---------- .../PowerPC/canonical-merge-shuffles.ll | 53 +++---- llvm/test/CodeGen/PowerPC/load-and-splat.ll | 117 +++++++-------- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 21 ++- .../CodeGen/PowerPC/scalar_vector_test_4.ll | 42 +++--- .../CodeGen/PowerPC/test-vector-insert.ll | 92 +++++------- .../PowerPC/v16i8_scalar_to_vector_shuffle.ll | 28 ++-- .../PowerPC/v2i64_scalar_to_vector_shuffle.ll | 44 ++---- .../PowerPC/v4i32_scalar_to_vector_shuffle.ll | 134 +++++++----------- .../PowerPC/v8i16_scalar_to_vector_shuffle.ll | 94 +++++------- 11 files changed, 303 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8a0858e246252..f1bd14d7ee011 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11492,13 +11492,33 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op0 = Op.getOperand(0); + ReuseLoadInfo RLI; + if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() && + Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD && + Op0.getValueType() == MVT::i32 && Op0.hasOneUse() && + canReuseLoadAddress(Op0, MVT::i32, RLI, DAG, ISD::NON_EXTLOAD)) { + + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = {RLI.Chain, RLI.Ptr, DAG.getValueType(Op.getValueType())}; + SDValue Bits = DAG.getMemIntrinsicNode( + PPCISD::LD_SPLAT, dl, DAG.getVTList(MVT::v4i32, MVT::Other), Ops, + MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + return Bits.getValue(0); + } + // Create a stack slot that is 16-byte aligned. 
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, Align(16), false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Val = Op.getOperand(0); + SDValue Val = Op0; EVT ValVT = Val.getValueType(); // P10 hardware store forwarding requires that a single store contains all // the data for the load. P10 is able to merge a pair of adjacent stores. Try diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll index 6d35a7281de6b..fba6725e2b2a3 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll @@ -27,20 +27,17 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v2i64_extload_0: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) ; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI0_0@toc@ha +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 +; PWR7-LE-NEXT: addi 3, 1, -16 ; PWR7-LE-NEXT: addi 4, 4, .LCPI0_0@toc@l -; PWR7-LE-NEXT: stw 3, -32(1) -; PWR7-LE-NEXT: addi 3, 1, -32 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -16 ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: vperm 2, 4, 3, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v2i64_extload_0: @@ -337,17 +334,13 @@ entry: define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_0: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -32(1) -; PWR7-BE-NEXT: std 3, -24(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -32 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 3, 4, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 2, 4, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_0: @@ -365,20 +358,17 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_0: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) ; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI8_0@toc@ha +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 +; PWR7-LE-NEXT: addi 3, 1, -16 ; PWR7-LE-NEXT: addi 4, 4, .LCPI8_0@toc@l -; PWR7-LE-NEXT: stw 3, -32(1) -; PWR7-LE-NEXT: addi 3, 1, -32 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -16 ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: vperm 2, 4, 3, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_0: @@ -400,17 +390,13 @@ entry: define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_1: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; 
PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_1: @@ -427,20 +413,17 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_1: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI9_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_1: @@ -463,17 +446,13 @@ entry: define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_2: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l -; PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_2: @@ -491,20 +470,17 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_2: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI10_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_2: @@ -526,17 +502,13 @@ entry: define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: build_v4i32_load_3: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lwz 3, 0(3) -; PWR7-BE-NEXT: xxlxor 36, 36, 36 -; PWR7-BE-NEXT: sldi 3, 3, 32 -; PWR7-BE-NEXT: std 3, -16(1) -; PWR7-BE-NEXT: std 3, -8(1) +; PWR7-BE-NEXT: lfiwzx 0, 0, 3 ; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha +; PWR7-BE-NEXT: xxlxor 36, 36, 36 ; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l -; 
PWR7-BE-NEXT: lxvw4x 34, 0, 3 -; PWR7-BE-NEXT: addi 3, 1, -16 ; PWR7-BE-NEXT: lxvw4x 35, 0, 3 -; PWR7-BE-NEXT: vperm 2, 4, 3, 2 +; PWR7-BE-NEXT: xxspltw 34, 0, 1 +; PWR7-BE-NEXT: vperm 2, 4, 2, 3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: build_v4i32_load_3: @@ -553,20 +525,17 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) { ; PWR7-LE-LABEL: build_v4i32_load_3: ; PWR7-LE: # %bb.0: # %entry ; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: lwz 3, 0(3) -; PWR7-LE-NEXT: stw 4, -32(1) +; PWR7-LE-NEXT: stw 4, -16(1) ; PWR7-LE-NEXT: addis 4, 2, .LCPI11_0@toc@ha -; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l -; PWR7-LE-NEXT: stw 3, -16(1) +; PWR7-LE-NEXT: lfiwzx 0, 0, 3 ; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: lxvd2x 0, 0, 4 -; PWR7-LE-NEXT: addi 4, 1, -32 +; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l ; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxswapd 34, 0 +; PWR7-LE-NEXT: xxspltw 35, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 35, 1 +; PWR7-LE-NEXT: xxswapd 34, 1 ; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: vperm 2, 3, 4, 2 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_3: diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index c26f98c5b0495..e1159e56e23eb 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -536,15 +536,12 @@ define dso_local <8 x i16> @testmrglb3(ptr nocapture readonly %a) local_unnamed_ ; ; P8-AIX-32-LABEL: testmrglb3: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 4(r3) +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f1, 0, r3 ; P8-AIX-32-NEXT: xxlxor v3, v3, v3 -; P8-AIX-32-NEXT: stw r4, -16(r1) -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -32(r1) -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 ; P8-AIX-32-NEXT: vmrghb v2, v3, v2 ; P8-AIX-32-NEXT: blr @@ -852,17 +849,15 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(ptr nocapture rea ; ; P8-AIX-32-LABEL: no_RAUW_in_combine_during_legalize: ; P8-AIX-32: # %bb.0: # %entry +; P8-AIX-32-NEXT: li r5, 0 ; P8-AIX-32-NEXT: slwi r4, r4, 2 ; P8-AIX-32-NEXT: xxlxor v3, v3, v3 -; P8-AIX-32-NEXT: lwzx r3, r3, r4 -; P8-AIX-32-NEXT: li r4, 0 -; P8-AIX-32-NEXT: stw r4, -32(r1) -; P8-AIX-32-NEXT: stw r3, -16(r1) -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX-32-NEXT: stw r5, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 ; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 -; P8-AIX-32-NEXT: xxmrghw v2, vs0, vs1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 +; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 ; P8-AIX-32-NEXT: vmrghb v2, v2, v3 ; P8-AIX-32-NEXT: blr entry: @@ -1026,14 +1021,11 @@ define dso_local <2 x i64> @testSplat8(ptr nocapture readonly %ptr) local_unname ; ; P8-AIX-32-LABEL: testSplat8: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 4(r3) -; P8-AIX-32-NEXT: stw r4, -16(r1) -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -32(r1) -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f1, 0, r3 +; 
P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0 ; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX-32-NEXT: blr @@ -1081,17 +1073,14 @@ define <2 x i64> @testSplati64_0(ptr nocapture readonly %ptr) #0 { ; ; P8-AIX-32-LABEL: testSplati64_0: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, 0(r3) -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: li r4, 4 +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX-32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0 -; P8-AIX-32-NEXT: stw r4, -32(r1) -; P8-AIX-32-NEXT: lxvw4x v2, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -32 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load <1 x i64>, ptr %ptr, align 8 diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index bc68ad2a67bf5..c9ee3a51f4172 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -208,47 +208,41 @@ define dso_local void @test4(ptr nocapture %c, ptr nocapture readonly %a) local_ ; ; P9-AIX32-LABEL: test4: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r5, 24(r4) -; P9-AIX32-NEXT: lwz r4, 28(r4) -; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: li r5, 28 +; P9-AIX32-NEXT: lxvwsx vs0, r4, r5 +; P9-AIX32-NEXT: li r5, 24 +; P9-AIX32-NEXT: lxvwsx vs1, r4, r5 ; P9-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -16(r1) -; P9-AIX32-NEXT: lxv vs2, -32(r1) -; P9-AIX32-NEXT: lxv vs0, 0(r4) -; P9-AIX32-NEXT: xxperm vs1, vs2, vs0 -; P9-AIX32-NEXT: stxv vs1, 0(r3) +; P9-AIX32-NEXT: lxv vs2, 0(r4) +; P9-AIX32-NEXT: xxperm vs0, vs1, vs2 +; P9-AIX32-NEXT: stxv vs0, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test4: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r5, 24(r4) -; P8-AIX32-NEXT: lwz r4, 28(r4) -; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: li r5, 28 +; P8-AIX32-NEXT: lfiwzx f0, r4, r5 +; P8-AIX32-NEXT: li r5, 24 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: lfiwzx f0, r4, r5 ; P8-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P8-AIX32-NEXT: stw r5, -32(r1) -; P8-AIX32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: lxvw4x v3, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -32 ; P8-AIX32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test4: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r5, 24(r4) -; P7-AIX32-NEXT: lwz r4, 28(r4) -; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: li r5, 28 +; P7-AIX32-NEXT: lfiwzx f0, r4, r5 +; P7-AIX32-NEXT: li r5, 24 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 +; P7-AIX32-NEXT: lfiwzx f0, r4, r5 ; P7-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: lxvw4x v2, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -16 -; P7-AIX32-NEXT: lxvw4x v3, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -32 ; P7-AIX32-NEXT: lxvw4x v4, 0, r4 -; P7-AIX32-NEXT: vperm v2, v4, v3, v2 +; P7-AIX32-NEXT: xxspltw v3, vs0, 1 +; P7-AIX32-NEXT: vperm v2, v3, v2, v4 ; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; 
P7-AIX32-NEXT: blr entry: @@ -362,47 +356,41 @@ define void @test6(ptr %a, ptr %in) { ; ; P9-AIX32-LABEL: test6: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r4, 0(r4) ; P9-AIX32-NEXT: li r5, 0 -; P9-AIX32-NEXT: stw r5, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) -; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 +; P9-AIX32-NEXT: stw r5, -16(r1) +; P9-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0 +; P9-AIX32-NEXT: lxvwsx vs1, 0, r4 ; P9-AIX32-NEXT: lxv vs2, -16(r1) -; P9-AIX32-NEXT: lxv vs0, 0(r4) -; P9-AIX32-NEXT: xxperm vs2, vs1, vs0 -; P9-AIX32-NEXT: stxv vs2, 0(r3) +; P9-AIX32-NEXT: lxv vs0, 0(r5) +; P9-AIX32-NEXT: xxperm vs1, vs2, vs0 +; P9-AIX32-NEXT: stxv vs1, 0(r3) ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: test6: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r4, 0(r4) ; P8-AIX32-NEXT: li r5, 0 -; P8-AIX32-NEXT: stw r5, -32(r1) -; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: stw r5, -16(r1) +; P8-AIX32-NEXT: lfiwzx f0, 0, r4 ; P8-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 -; P8-AIX32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -32 ; P8-AIX32-NEXT: lxvw4x v3, 0, r4 ; P8-AIX32-NEXT: addi r4, r1, -16 ; P8-AIX32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX32-NEXT: vperm v2, v3, v4, v2 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: vperm v2, v4, v2, v3 ; P8-AIX32-NEXT: stxvw4x v2, 0, r3 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: test6: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r4, 0(r4) ; P7-AIX32-NEXT: li r5, 0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: stw r5, -16(r1) +; P7-AIX32-NEXT: lfiwzx f0, 0, r4 ; P7-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 -; P7-AIX32-NEXT: lxvw4x v2, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -32 ; P7-AIX32-NEXT: lxvw4x v3, 0, r4 ; P7-AIX32-NEXT: addi r4, r1, -16 ; P7-AIX32-NEXT: lxvw4x v4, 0, r4 -; P7-AIX32-NEXT: vperm v2, v3, v4, v2 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 +; P7-AIX32-NEXT: vperm v2, v4, v2, v3 ; P7-AIX32-NEXT: stxvw4x v2, 0, r3 ; P7-AIX32-NEXT: blr entry: @@ -810,40 +798,31 @@ define <16 x i8> @unadjusted_lxvdsx(ptr %s, ptr %t) { ; ; P9-AIX32-LABEL: unadjusted_lxvdsx: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r4, 4(r3) -; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lwz r3, 0(r3) -; P9-AIX32-NEXT: lxv vs0, -16(r1) -; P9-AIX32-NEXT: stw r3, -32(r1) -; P9-AIX32-NEXT: lxv vs1, -32(r1) +; P9-AIX32-NEXT: li r4, 4 +; P9-AIX32-NEXT: lxvwsx vs1, 0, r3 +; P9-AIX32-NEXT: lxvwsx vs0, r3, r4 ; P9-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P9-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: unadjusted_lxvdsx: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r4, 4(r3) -; P8-AIX32-NEXT: stw r4, -16(r1) -; P8-AIX32-NEXT: lwz r3, 0(r3) -; P8-AIX32-NEXT: stw r3, -32(r1) -; P8-AIX32-NEXT: addi r3, r1, -16 -; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 -; P8-AIX32-NEXT: addi r3, r1, -32 -; P8-AIX32-NEXT: lxvw4x vs1, 0, r3 +; P8-AIX32-NEXT: li r4, 4 +; P8-AIX32-NEXT: lfiwzx f1, 0, r3 +; P8-AIX32-NEXT: lfiwzx f0, r3, r4 +; P8-AIX32-NEXT: xxspltw vs1, vs1, 1 +; P8-AIX32-NEXT: xxspltw vs0, vs0, 1 ; P8-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P8-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: unadjusted_lxvdsx: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r4, 4(r3) -; P7-AIX32-NEXT: stw r4, -16(r1) -; P7-AIX32-NEXT: lwz r3, 0(r3) -; P7-AIX32-NEXT: stw r3, -32(r1) -; P7-AIX32-NEXT: addi r3, r1, -16 -; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 -; P7-AIX32-NEXT: addi r3, r1, -32 -; P7-AIX32-NEXT: lxvw4x vs1, 0, r3 +; P7-AIX32-NEXT: li r4, 4 
+; P7-AIX32-NEXT: lfiwzx f1, 0, r3 +; P7-AIX32-NEXT: lfiwzx f0, r3, r4 +; P7-AIX32-NEXT: xxspltw vs1, vs1, 1 +; P7-AIX32-NEXT: xxspltw vs0, vs0, 1 ; P7-AIX32-NEXT: xxmrghw vs0, vs1, vs0 ; P7-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P7-AIX32-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index 4da36c9af5c10..4435484ae0b94 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -85,23 +85,20 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) { ; ; P9BE-AIX32-LABEL: test64: ; P9BE-AIX32: # %bb.0: # %entry -; P9BE-AIX32-NEXT: lwzux 4, 3, 4 +; P9BE-AIX32-NEXT: add 5, 3, 4 +; P9BE-AIX32-NEXT: lxvwsx 0, 3, 4 +; P9BE-AIX32-NEXT: li 3, 4 ; P9BE-AIX32-NEXT: xxlxor 2, 2, 2 ; P9BE-AIX32-NEXT: vspltisw 4, 8 -; P9BE-AIX32-NEXT: stw 4, -48(1) +; P9BE-AIX32-NEXT: lxvwsx 1, 5, 3 +; P9BE-AIX32-NEXT: lwz 3, L..C0(2) # %const.0 ; P9BE-AIX32-NEXT: vadduwm 4, 4, 4 -; P9BE-AIX32-NEXT: lwz 4, 4(3) -; P9BE-AIX32-NEXT: lxv 0, -48(1) -; P9BE-AIX32-NEXT: stw 4, -32(1) -; P9BE-AIX32-NEXT: lwz 4, L..C0(2) # %const.0 -; P9BE-AIX32-NEXT: lxv 1, -32(1) -; P9BE-AIX32-NEXT: lwz 3, 8(3) -; P9BE-AIX32-NEXT: stw 3, -16(1) -; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1 ; P9BE-AIX32-NEXT: xxmrghw 2, 0, 1 -; P9BE-AIX32-NEXT: lxv 0, 0(4) +; P9BE-AIX32-NEXT: lxv 0, 0(3) +; P9BE-AIX32-NEXT: li 3, 8 ; P9BE-AIX32-NEXT: xxperm 2, 2, 0 -; P9BE-AIX32-NEXT: lxv 0, -16(1) +; P9BE-AIX32-NEXT: lxvwsx 0, 5, 3 +; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1 ; P9BE-AIX32-NEXT: xxmrghw 3, 1, 0 ; P9BE-AIX32-NEXT: lxv 0, 0(3) ; P9BE-AIX32-NEXT: xxperm 3, 3, 0 diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll index 25e1baa28f7ef..c8e0d0d25f4f7 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -73,13 +73,11 @@ define <4 x i32> @s2v_test1(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test1: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C0(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load i32, ptr %int32, align 4 @@ -142,13 +140,12 @@ define <4 x i32> @s2v_test2(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test2: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: addi r3, r3, 4 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C1(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, ptr %int32, i64 1 @@ -224,13 +221,11 @@ define <4 x i32> @s2v_test3(ptr nocapture readonly %int32, <4 x i32> %vec, i32 s ; P8-AIX-32-LABEL: s2v_test3: ; P8-AIX-32: # %bb.0: # %entry ; P8-AIX-32-NEXT: slwi r4, r4, 2 -; P8-AIX-32-NEXT: lwzx r3, r3, r4 -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 ; P8-AIX-32-NEXT: lwz r3, L..C2(r2) # %const.0 -; 
P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %idxprom = sext i32 %Idx to i64 @@ -295,13 +290,12 @@ define <4 x i32> @s2v_test4(ptr nocapture readonly %int32, <4 x i32> %vec) { ; ; P8-AIX-32-LABEL: s2v_test4: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 4(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: addi r3, r3, 4 +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, ptr %int32, i64 1 @@ -362,13 +356,11 @@ define <4 x i32> @s2v_test5(<4 x i32> %vec, ptr nocapture readonly %ptr1) { ; ; P8-AIX-32-LABEL: s2v_test5: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: stw r3, -16(r1) +; P8-AIX-32-NEXT: lfiwzx f0, 0, r3 ; P8-AIX-32-NEXT: lwz r3, L..C4(r2) # %const.0 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r3 -; P8-AIX-32-NEXT: vperm v2, v4, v2, v3 +; P8-AIX-32-NEXT: xxspltw v3, vs0, 1 +; P8-AIX-32-NEXT: vperm v2, v3, v2, v4 ; P8-AIX-32-NEXT: blr entry: %0 = load i32, ptr %ptr1, align 4 diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll index 73b4ad8a507b8..47fa6f2a5b4d2 100644 --- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll +++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll @@ -25,16 +25,13 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI0_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -59,16 +56,12 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) { ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI0_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test: @@ -96,16 +89,13 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; 
CHECK-LE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI1_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -130,16 +120,12 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) { ; CHECK-BE-P7-NEXT: xscvdpsxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI1_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test2: @@ -167,16 +153,13 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, -32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -201,16 +184,12 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) { ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI2_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test3: @@ -238,16 +217,13 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) { ; CHECK-LE-P7: # %bb.0: # %entry ; CHECK-LE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-LE-P7-NEXT: addi r3, r1, -4 +; CHECK-LE-P7-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-LE-P7-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-LE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-LE-P7-NEXT: lwz r3, -4(r1) -; CHECK-LE-P7-NEXT: stw r3, 
-32(r1) -; CHECK-LE-P7-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-LE-P7-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: addi r3, r1, -32 +; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P7-NEXT: xxswapd v3, vs0 -; CHECK-LE-P7-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P7-NEXT: xxswapd v4, vs0 +; CHECK-LE-P7-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P7-NEXT: xxspltw v4, vs0, 1 ; CHECK-LE-P7-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P7-NEXT: blr ; @@ -272,16 +248,12 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) { ; CHECK-BE-P7-NEXT: xscvdpuxws f0, f1 ; CHECK-BE-P7-NEXT: addi r3, r1, -4 ; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3 -; CHECK-BE-P7-NEXT: lwz r3, -4(r1) -; CHECK-BE-P7-NEXT: sldi r3, r3, 32 -; CHECK-BE-P7-NEXT: std r3, -32(r1) -; CHECK-BE-P7-NEXT: std r3, -24(r1) +; CHECK-BE-P7-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI3_0@toc@ha ; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P7-NEXT: addi r3, r1, -32 ; CHECK-BE-P7-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P7-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P7-NEXT: xxspltw v3, vs0, 1 +; CHECK-BE-P7-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P7-NEXT: blr ; ; CHECK-BE-P8-LABEL: test4: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 11cc8abd2c7fa..31d0960e19f4e 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -2045,31 +2045,25 @@ define <16 x i8> @test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapt ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: li r5, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 ; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r4, r5 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: li r3, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs2, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, r4, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll index 8bb71e073e814..56c8c128ba9f4 100644 --- 
a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll @@ -1685,43 +1685,29 @@ define <2 x i64> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapt ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 +; CHECK-AIX-32-P8-NEXT: li r5, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r5 +; CHECK-AIX-32-P8-NEXT: lfiwzx f2, r4, r5 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs3, vs3, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs2, vs2, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -64 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs3, vs2 ; CHECK-AIX-32-P8-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: li r5, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r5 ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r4, r5 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r4 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P9-NEXT: vaddudm v2, v3, v2 diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll index 4ca55d276647b..c8e7b20e4b8c3 100644 --- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -743,25 +743,21 @@ define void @test_v8i16_v4i32(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw 
vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -842,25 +838,21 @@ define void @test_v8i16_v2i64(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1030,25 +1022,21 @@ define void @test_v4i32_v8i16(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1125,26 +1113,18 @@ define void @test_v4i32_v2i64(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: 
stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1212,14 +1192,11 @@ define void @test_v2i64_v2i64(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3) -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: li r4, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 @@ -1229,12 +1206,9 @@ define void @test_v2i64_v2i64(ptr %a) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: li r4, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r4 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 @@ -1308,26 +1282,18 @@ define void @test_v2i64_v4i32(ptr %a) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) 
-; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -1407,25 +1373,21 @@ define void @test_v2i64_v8i16(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index 201bc5be54506..e1aa531db449e 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -654,17 +654,14 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3) +; CHECK-AIX-32-P8-NEXT: li r4, 4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, r3, r4 ; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs1, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -672,14 +669,11 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3) +; CHECK-AIX-32-P9-NEXT: li r4, 4 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 ; CHECK-AIX-32-P9-NEXT: xxlxor vs2, vs2, vs2 -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, r3, r4 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: lxv 
vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm vs0, vs2, vs1 @@ -847,24 +841,20 @@ define <16 x i8> @test_v8i16_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -937,24 +927,20 @@ define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1149,24 +1135,20 @@ define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv 
v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1519,24 +1501,20 @@ define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: From a724f9a7e5d46c9bf49c7b5e207f792fb5214c10 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 10:08:05 -0700 Subject: [PATCH 125/425] [SLP][NFC]Make whole reg non-power-2 test for x86 and aarch64 along with risc-v --- .../SLPVectorizer/{RISCV => }/reduction-whole-regs-loads.ll | 3 +++ 1 file changed, 3 insertions(+) rename llvm/test/Transforms/SLPVectorizer/{RISCV => }/reduction-whole-regs-loads.ll (87%) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll similarity index 87% rename from llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 54dc33dbc0d00..c077181c35063 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target define i64 @test(ptr %p) { ; CHECK-LABEL: @test( From 2092f3527ed743a8fb9e0858c839cd4b26907f2a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 4 Sep 2024 10:20:13 -0700 Subject: [PATCH 126/425] [SLP][NFC]Remove unsupported attribute --- .../Transforms/SLPVectorizer/reduction-whole-regs-loads.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 
c077181c35063..281b5f99540ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s ; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target define i64 @test(ptr %p) { From 601645c3b70e2a17d18779a3a51b8bc9ecdc9aa6 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Wed, 4 Sep 2024 16:52:49 +0000 Subject: [PATCH 127/425] [clang] Fix FIXME in dynamic initializer emission, NFCI This potentially affects platforms that support comdats other than ELF, COFF, or wasm, but that is the intention of the FIXME, and if they don't want this behavior, they probably shouldn't advertise comdat support. --- clang/lib/CodeGen/CGDeclCXX.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 8dcb5f6100619..c44f38ef02a3f 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -640,13 +640,13 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, addUsedGlobal(COMDATKey); } - // If we used a COMDAT key for the global ctor, the init function can be - // discarded if the global ctor entry is discarded. - // FIXME: Do we need to restrict this to ELF and Wasm? + // If comdats are in use and supported, place the initializer function into + // the comdat group of the global. In the MS ABI, initializers are mangled + // and have their own comdat, so we don't include them in the group for + // consistency with MSVC. llvm::Comdat *C = Addr->getComdat(); - if (COMDATKey && C && - (getTarget().getTriple().isOSBinFormatELF() || - getTarget().getTriple().isOSBinFormatWasm())) { + if (COMDATKey && C && getTriple().supportsCOMDAT() && + !getTarget().getCXXABI().isMicrosoft()) { Fn->setComdat(C); } } else { From 9a2fd97d391caf1060e303f636d7113501788d2f Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 4 Sep 2024 10:41:09 -0700 Subject: [PATCH 128/425] Reapply^2 "[HWASan] remove incorrectly inferred attributes" (#106622) (#106816) This reverts commit 66927fb95abef9327b453d7213c5df7d641269be. 
Filter functions this applies to, which I initially wanted to do in a follow up to make reverts easier, but turns out without that it gets really slow Fleetbench proto: no significant movement Fleetbench hashing: no significant movement Fleetbench libc: no significant movement 2nd stage LLVM build: https://lab.llvm.org/buildbot/#/builders/55/builds/1765/steps/9/logs/stdio after this change: 80833.56user 3303.04system previous build: 78430.21user 3258.04system --- .../Instrumentation/HWAddressSanitizer.cpp | 43 +++- .../HWAddressSanitizer/RISCV/alloca.ll | 138 +++++----- .../HWAddressSanitizer/RISCV/basic.ll | 242 +++++++++--------- .../HWAddressSanitizer/alloca.ll | 138 +++++----- .../HWAddressSanitizer/basic.ll | 180 ++++++------- .../HWAddressSanitizer/mem-attr.ll | 10 +- 6 files changed, 394 insertions(+), 357 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 69e5835bee8a5..a7e7f9a570dac 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,6 +598,41 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); + for (auto &F : M.functions()) { + // Remove memory attributes that are invalid with HWASan. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. + // + // This is not only true for sanitized functions, because AttrInfer can + // infer those attributes on libc functions, which is not true if those + // are instrumented (Android) or intercepted. + + // The API is weird. `onlyReadsMemory` actually means "does not write", and + // `onlyWritesMemory` actually means "does not read". So we reconstruct + // "accesses memory" && "does not read" <=> "writes". + bool Changed = false; + if (!F.doesNotAccessMemory()) { + bool WritesMemory = !F.onlyReadsMemory(); + bool ReadsMemory = !F.onlyWritesMemory(); + if ((WritesMemory && !ReadsMemory) || F.onlyAccessesArgMemory()) { + F.removeFnAttr(Attribute::Memory); + Changed = true; + } + } + for (Argument &A : F.args()) { + if (A.hasAttribute(Attribute::WriteOnly)) { + Changed = true; + A.removeAttr(Attribute::WriteOnly); + } + } + if (Changed) { + // nobuiltin makes sure later passes don't restore assumptions about + // the function. + F.addFnAttr(Attribute::NoBuiltin); + } + } + // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1622,14 +1657,6 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); - // Remove memory attributes that are about to become invalid. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. 
- F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 23b1043c70016..5fd9dc6eede21 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; 
DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, 
!dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: 
call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -147,15 +147,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -163,13 +164,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
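To make the effect of the attribute filtering in initializeModule() concrete, here is a small hypothetical example; it is not taken from this patch or its tests, and the function name and attributes are invented for illustration. The function only writes its argument memory, so it falls into the "writes and does not read" / "argmem only" case: the memory attribute and the writeonly parameter attribute are stripped, and nobuiltin is added so later passes do not restore those assumptions.

; Hypothetical input IR, assuming the module is built with HWASan enabled.
define void @set_byte(ptr writeonly %p) sanitize_hwaddress memory(argmem: write) {
entry:
  store i8 0, ptr %p, align 1
  ret void
}

; Conceptually, after HWAddressSanitizer::initializeModule() the definition
; would look roughly like:
;   define void @set_byte(ptr %p) nobuiltin sanitize_hwaddress { ... }
; The shadow check inserted later reads memory outside %p, and the short
; granule check reads the last byte of %p's granule, so keeping the original
; attributes would be unsound.
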
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index 9cebe2e845f77..5415b08128663 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,8 +9,6 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] -; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -32,7 +30,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -68,7 +66,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -88,7 +86,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -108,10 +106,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -120,13 +118,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -145,7 +143,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -165,10 +163,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -177,13 +175,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] 
= add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -212,7 +210,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -248,7 +246,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -268,7 +266,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -288,10 +286,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -300,13 +298,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -325,7 +323,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -345,10 +343,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -357,13 +355,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -392,7 +390,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -428,7 +426,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -448,7 +446,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -468,10 +466,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -480,13 +478,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -505,7 +503,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -525,10 +523,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -537,13 +535,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -572,7 +570,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -608,7 +606,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -628,7 +626,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -648,10 +646,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -660,13 +658,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -685,7 +683,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -705,10 +703,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -717,13 +715,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -752,7 +750,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -788,7 +786,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -808,7 +806,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -828,10 +826,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -840,13 +838,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -865,7 +863,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -885,10 +883,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -897,13 +895,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1013,7 +1011,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1049,7 +1047,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1069,7 +1067,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1089,10 +1087,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1101,13 +1099,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1126,7 +1124,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1146,10 +1144,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1158,13 +1156,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1193,7 +1191,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1229,7 +1227,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1249,7 +1247,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1269,10 +1267,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof 
[[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1281,13 +1279,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1306,7 +1304,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1326,10 +1324,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1338,13 +1336,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { 
; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1373,7 +1371,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1409,7 +1407,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1429,7 +1427,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1449,10 +1447,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1461,13 +1459,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1486,7 +1484,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1506,10 +1504,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -1518,13 +1516,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1553,7 +1551,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1589,7 +1587,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1609,7 +1607,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1629,10 +1627,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1641,13 +1639,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1666,7 +1664,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1686,10 +1684,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect 
"ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1698,13 +1696,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1733,7 +1731,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1769,7 +1767,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1789,7 +1787,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1809,10 +1807,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1821,13 +1819,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1846,7 +1844,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1866,10 +1864,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof 
[[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1878,13 +1876,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 4bd23ea76c159..73f56de707b21 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 
72057594037927935, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg 
[[DBG15]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] 
= and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] ; entry: %x = alloca i32, align 4 @@ -166,15 +166,16 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; DYNAMIC-SHADOW: [[META9]] = !{null} -; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; 
DYNAMIC-SHADOW: [[META10]] = !{null} +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -182,13 +183,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META9]] = !{null} -; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} +; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META10]] = !{null} +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) ;. 
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 4212293f42545..afbb8f5001114 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] +; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 
[[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; 
RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr 
[[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: 
[[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] 
= inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], 
label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; FASTPATH-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) 
sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll index c0e370f20213a..724e6c5a0bdec 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll @@ -11,5 +11,13 @@ entry: ret void } -; CHECK: attributes #0 = { sanitize_hwaddress uwtable } +; CHECK: define dso_local void @test_readonly(ptr nocapture noundef readonly %p) local_unnamed_addr #0 +define dso_local void @test_readonly(ptr nocapture noundef readonly %p) local_unnamed_addr #1 { +entry: + store i32 42, ptr %p, align 4 + ret void +} + +; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable } attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable } +attributes #1 = { sanitize_hwaddress memory(argmem: read) uwtable } From 4228e28293458e6ec49bd5487210719ff33c319a Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:51:52 -0700 Subject: [PATCH 129/425] [flang] Fix crash in semantics (#106158) Semantics crashes when merging a USE-associated derived type with a local generic procedure interface of the same name. (The other direction works.) 
--- flang/lib/Semantics/resolve-names.cpp | 9 +++-- flang/test/Semantics/generic09.f90 | 47 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 flang/test/Semantics/generic09.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index ec8f854f64d10..2e86e0afc9bd0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3131,7 +3131,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, combinedDerivedType = useDerivedType; } else { const Scope *localScope{localDerivedType->scope()}; - const Scope *useScope{useDerivedType->scope()}; + const Scope *useScope{useDerivedType->GetUltimate().scope()}; if (localScope && useScope && localScope->derivedTypeSpec() && useScope->derivedTypeSpec() && evaluate::AreSameDerivedType( @@ -3307,7 +3307,12 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, AddGenericUse(newUseGeneric, localName, useUltimate); newUseGeneric.AddUse(*localSymbol); if (combinedDerivedType) { - newUseGeneric.set_derivedType(*const_cast(combinedDerivedType)); + if (const auto *oldDT{newUseGeneric.derivedType()}) { + CHECK(&oldDT->GetUltimate() == &combinedDerivedType->GetUltimate()); + } else { + newUseGeneric.set_derivedType( + *const_cast(combinedDerivedType)); + } } if (combinedProcedure) { newUseGeneric.set_specific(*const_cast(combinedProcedure)); diff --git a/flang/test/Semantics/generic09.f90 b/flang/test/Semantics/generic09.f90 new file mode 100644 index 0000000000000..6159dd4b701d7 --- /dev/null +++ b/flang/test/Semantics/generic09.f90 @@ -0,0 +1,47 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module m1 + type foo + integer n + integer :: m = 1 + end type +end + +module m2 + use m1 + interface foo + module procedure f1 + end interface + contains + type(foo) function f1(a) + real, intent(in) :: a + f1%n = a + f1%m = 2 + end +end + +module m3 + use m2 + interface foo + module procedure f2 + end interface + contains + type(foo) function f2(a) + double precision, intent(in) :: a + f2%n = a + f2%m = 3 + end +end + +program main + use m3 + type(foo) x +!CHECK: foo(n=1_4,m=1_4) + x = foo(1) + print *, x +!CHECK: f1(2._4) + x = foo(2.) + print *, x +!CHECK: f2(3._8) + x = foo(3.d0) + print *, x +end From 6facf6981488700c1554dcce36d4ac774a91d568 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:52:20 -0700 Subject: [PATCH 130/425] [flang][runtime] Correct RANDOM_INIT seed generation (#106250) The initial seed was generated from a bitwise AND ("&") of two clock-generated values, instead of an XOR or (best) a truncated integer multiplication. Maybe I mistyped a shift-7 instead of a shift-6 or shift-8 when I wrote that line, but it was most likely just stupidity. Fixes https://github.com/llvm/llvm-project/issues/106221. 
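To make the failure mode concrete, here is a minimal, self-contained sketch (not part of the flang sources; it assumes a POSIX `clock_gettime`/`CLOCK_REALTIME`) that derives a seed both ways. With `&`, a zero bit in either the seconds or the nanoseconds field forces a zero bit in the seed, so seeds taken close together cluster and can repeat; with `^`, every nanosecond bit that changes flips the corresponding seed bit.

```cpp
// Hypothetical demo, not flang code: contrast the old '&' seed with the new '^' seed.
#include <cstdio>
#include <time.h>

int main() {
  for (int i = 0; i < 4; ++i) {
    timespec ts{};
    clock_gettime(CLOCK_REALTIME, &ts);
    unsigned long andSeed = ts.tv_sec & ts.tv_nsec; // old: biased, many bits forced to zero
    unsigned long xorSeed = ts.tv_sec ^ ts.tv_nsec; // new: preserves the tv_nsec entropy
    std::printf("and=%#lx xor=%#lx\n", andSeed, xorSeed);
  }
  return 0;
}
```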
--- flang/runtime/random.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp index e0a421fd28396..69de9b8c96fb5 100644 --- a/flang/runtime/random.cpp +++ b/flang/runtime/random.cpp @@ -42,7 +42,7 @@ void RTNAME(RandomInit)(bool repeatable, bool /*image_distinct*/) { #ifdef CLOCK_REALTIME timespec ts; clock_gettime(CLOCK_REALTIME, &ts); - generator.seed(ts.tv_sec & ts.tv_nsec); + generator.seed(ts.tv_sec ^ ts.tv_nsec); #else generator.seed(time(nullptr)); #endif From 9e53e77265769f1916d8c4fd8ed8263798e8e815 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:52:51 -0700 Subject: [PATCH 131/425] [flang] Fix warnings from more recent GCCs (#106567) While experimenting with some more recent C++ features, I ran into trouble with warnings from GCC 12.3.0 and 14.2.0. These warnings looked legitimate, so I've tweaked the code to avoid them. --- flang/include/flang/Evaluate/integer.h | 6 +++--- flang/include/flang/Runtime/descriptor.h | 2 +- flang/lib/Lower/ConvertExpr.cpp | 3 ++- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 3 ++- flang/lib/Semantics/check-omp-structure.cpp | 15 ++++++++------- flang/lib/Semantics/tools.cpp | 2 +- flang/unittests/Runtime/Reduction.cpp | 6 +++--- flang/unittests/Runtime/Transformational.cpp | 10 +++++----- 9 files changed, 26 insertions(+), 23 deletions(-) diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h index e69876f07b63c..e420eb75e3dff 100644 --- a/flang/include/flang/Evaluate/integer.h +++ b/flang/include/flang/Evaluate/integer.h @@ -828,9 +828,9 @@ class Integer { if (Part ypart{y.LEPart(k)}) { BigPart xy{xpart}; xy *= ypart; -#if defined __GNUC__ && __GNUC__ < 8 - // && to < (2 * parts) was added to avoid GCC < 8 build failure on - // -Werror=array-bounds. This can be removed if -Werror is disable. +#if defined __GNUC__ && __GNUC__ < 8 || __GNUC__ >= 12 + // && to < (2 * parts) was added to avoid GCC build failure on + // -Werror=array-bounds. This can be removed if -Werror is disabled. for (int to{j + k}; xy != 0 && to < (2 * parts); ++to) { #else for (int to{j + k}; xy != 0; ++to) { diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h index 043f6931afad9..030d0c1031fba 100644 --- a/flang/include/flang/Runtime/descriptor.h +++ b/flang/include/flang/Runtime/descriptor.h @@ -438,7 +438,7 @@ class Descriptor { } RT_API_ATTRS inline void SetAllocIdx(int pos) { raw_.extra &= ~_CFI_ALLOCATOR_IDX_MASK; // Clear the allocator index bits. - raw_.extra |= (pos << _CFI_ALLOCATOR_IDX_SHIFT); + raw_.extra |= pos << _CFI_ALLOCATOR_IDX_SHIFT; } private: diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 7dd317d64436b..62a7615e1af13 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -5590,7 +5590,8 @@ class ArrayExprLowering { ty = unwrapBoxEleTy(ty); mlir::Location loc = getLoc(); mlir::IndexType idxTy = builder.getIndexType(); - for (auto extent : mlir::cast(ty).getShape()) { + auto seqType = mlir::cast(ty); + for (auto extent : seqType.getShape()) { auto v = extent == fir::SequenceType::getUnknownExtent() ? 
builder.create(loc, idxTy).getResult() : builder.createIntegerConstant(loc, idxTy, extent); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index e419b26125299..eb91969236ae0 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1273,7 +1273,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } else { // Compute the value of the extra field based on allocator_idx and // addendum present using a Descriptor object. - Fortran::runtime::StaticDescriptor<0> staticDescriptor; + Fortran::runtime::StaticDescriptor staticDescriptor; Fortran::runtime::Descriptor &desc{staticDescriptor.descriptor()}; desc.raw().extra = 0; desc.SetAllocIdx(allocatorIdx); diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index 98205959020d2..536f2077e4f70 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -411,7 +411,8 @@ class DesignateOpConversion llvm::SmallVector firstElementIndices; auto indices = designate.getIndices(); int i = 0; - for (auto isTriplet : designate.getIsTripletAttr().asArrayRef()) { + auto attrs = designate.getIsTripletAttr(); + for (auto isTriplet : attrs.asArrayRef()) { // Coordinate of the first element are the index and triplets lower // bounds firstElementIndices.push_back(indices[i]); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 50840898438c7..643b713b32e29 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1796,13 +1796,12 @@ inline void OmpStructureChecker::ErrIfLHSAndRHSSymbolsMatch( const auto *e{GetExpr(context_, expr)}; const auto *v{GetExpr(context_, var)}; if (e && v) { - const Symbol &varSymbol = evaluate::GetSymbolVector(*v).front(); + auto vSyms{evaluate::GetSymbolVector(*v)}; + const Symbol &varSymbol = vSyms.front(); for (const Symbol &symbol : evaluate::GetSymbolVector(*e)) { if (varSymbol == symbol) { context_.Say(expr.source, - "RHS expression " - "on atomic assignment statement" - " cannot access '%s'"_err_en_US, + "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, var.GetSource().ToString()); } } @@ -1942,12 +1941,14 @@ void OmpStructureChecker::CheckAtomicUpdateStmt( "Expected scalar variable " "on the LHS of atomic update assignment " "statement"_err_en_US); - const Symbol &varSymbol = evaluate::GetSymbolVector(*v).front(); + auto vSyms{evaluate::GetSymbolVector(*v)}; + const Symbol &varSymbol = vSyms.front(); int numOfSymbolMatches{0}; - SymbolVector exprSymbols = evaluate::GetSymbolVector(*e); + SymbolVector exprSymbols{evaluate::GetSymbolVector(*e)}; for (const Symbol &symbol : exprSymbols) { - if (varSymbol == symbol) + if (varSymbol == symbol) { numOfSymbolMatches++; + } } if (isIntrinsicProcedure) { std::string varName = var.GetSource().ToString(); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 845183f2d9b25..8d16ab7100876 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1354,7 +1354,7 @@ ComponentIterator::const_iterator::BuildResultDesignatorName() const { std::string designator; for (const auto &node : componentPath_) { - designator += "%" + DEREF(node.component()).name().ToString(); + designator += "%"s + DEREF(node.component()).name().ToString(); } return designator; } diff --git 
a/flang/unittests/Runtime/Reduction.cpp b/flang/unittests/Runtime/Reduction.cpp index 41c8d86c35b76..25eb5fd760ead 100644 --- a/flang/unittests/Runtime/Reduction.cpp +++ b/flang/unittests/Runtime/Reduction.cpp @@ -42,7 +42,7 @@ TEST(Reductions, DimMaskProductInt4) { shape, std::vector{1, 2, 3, 4, 5, 6})}; auto mask{MakeArray( shape, std::vector{true, false, false, true, true, true})}; - StaticDescriptor<1, true> statDesc; + StaticDescriptor statDesc; Descriptor &prod{statDesc.descriptor()}; RTNAME(ProductDim)(prod, *array, 1, __FILE__, __LINE__, &*mask); EXPECT_EQ(prod.rank(), 1); @@ -152,7 +152,7 @@ TEST(Reductions, DoubleMaxMinNorm2) { // A scalar result occurs when you have a rank 1 array and dim == 1. std::vector shape1{24}; auto array1{MakeArray(shape1, rawData)}; - StaticDescriptor<1, true> statDesc0[1]; + StaticDescriptor<2, true> statDesc0[1]; Descriptor &scalarResult{statDesc0[0].descriptor()}; RTNAME(MaxlocDim) (scalarResult, *array1, /*KIND=*/2, /*DIM=*/1, __FILE__, __LINE__, @@ -655,7 +655,7 @@ TEST(Reductions, ReduceInt4) { TEST(Reductions, ReduceInt4Dim) { auto intMatrix{MakeArray( std::vector{2, 2}, std::vector{1, 2, 3, 4})}; - StaticDescriptor<1, true> statDesc; + StaticDescriptor<2, true> statDesc; Descriptor &sums{statDesc.descriptor()}; RTNAME(ReduceInteger4DimRef)(sums, *intMatrix, IAdd, __FILE__, __LINE__, 1); EXPECT_EQ(sums.rank(), 1); diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index 5678ea2515775..5836e70c740f9 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -33,7 +33,7 @@ template static void testBesselJn(BesselFuncType rtFunc, int32_t n1, int32_t n2, CppTypeFor x, const std::vector> &expected) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; unsigned len = expected.size(); @@ -60,7 +60,7 @@ static void testBesselJn(BesselFuncType rtFunc, int32_t n1, int32_t n2, template static void testBesselJnX0( BesselX0FuncType rtFunc, int32_t n1, int32_t n2) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; rtFunc(result, n1, n2, __FILE__, __LINE__); @@ -131,7 +131,7 @@ template static void testBesselYn(BesselFuncType rtFunc, int32_t n1, int32_t n2, CppTypeFor x, const std::vector> &expected) { - StaticDescriptor<1> desc; + StaticDescriptor desc; Descriptor &result{desc.descriptor()}; unsigned len = expected.size(); @@ -158,7 +158,7 @@ static void testBesselYn(BesselFuncType rtFunc, int32_t n1, int32_t n2, template static void testBesselYnX0( BesselX0FuncType rtFunc, int32_t n1, int32_t n2) { - StaticDescriptor<1> desc; + StaticDescriptor<2> desc; Descriptor &result{desc.descriptor()}; rtFunc(result, n1, n2, __FILE__, __LINE__); @@ -383,7 +383,7 @@ TEST(Transformational, Pack) { std::vector{false, true, true, false, false, true})}; mask->GetDimension(0).SetLowerBound(0); // shouldn't matter mask->GetDimension(1).SetLowerBound(2); - StaticDescriptor<1, true> statDesc; + StaticDescriptor statDesc; Descriptor &result{statDesc.descriptor()}; RTNAME(Pack)(result, *array, *mask, nullptr, __FILE__, __LINE__); From 500f6cc25cb93607e9ea13732b791297acf8f97f Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:53:22 -0700 Subject: [PATCH 132/425] [flang][runtime] Support SPACING for REAL(2 & 3) (#106575) Add runtime APIs for the intrinsic function SPACING for REAL kinds 2 & 3 in two ways: Spacing2 (& 3) for build 
environments with std::float16_t, and Spacing2By4 (& 3By4) variants (for any build environment) which compute SPACING for those types but accept and return their values as 32-bit floats. SPACING for REAL(2) is needed by HDF5. --- flang/include/flang/Runtime/cpp-type.h | 36 +++++++++++++++++++++++--- flang/include/flang/Runtime/numeric.h | 15 +++++++++++ flang/runtime/numeric-templates.h | 9 +++++-- flang/runtime/numeric.cpp | 20 ++++++++++++++ flang/unittests/Runtime/Numeric.cpp | 5 ++++ 5 files changed, 79 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Runtime/cpp-type.h b/flang/include/flang/Runtime/cpp-type.h index 5141d0691c5c6..fe21dd544cf7d 100644 --- a/flang/include/flang/Runtime/cpp-type.h +++ b/flang/include/flang/Runtime/cpp-type.h @@ -14,11 +14,20 @@ #include "flang/Common/Fortran.h" #include "flang/Common/float128.h" #include "flang/Common/uint128.h" -#include #include #include +#if __cplusplus >= 202302 +#include +#endif #include +#if !defined HAS_FP16 && __STDCPP_FLOAT16_T__ +#define HAS_FP16 1 +#endif +#if !defined HAS_BF16 && __STDCPP_BFLOAT16_T__ +#define HAS_BF16 1 +#endif + namespace Fortran::runtime { using common::TypeCategory; @@ -37,24 +46,43 @@ template struct CppTypeForHelper { using type = common::HostSignedIntType<8 * KIND>; }; -// TODO: REAL/COMPLEX(2 & 3) +#if HAS_FP16 +template <> struct CppTypeForHelper { + using type = std::float16_t; +}; +#endif +#if HAS_BF16 +template <> struct CppTypeForHelper { + using type = std::bfloat16_t; +}; +#endif template <> struct CppTypeForHelper { +#if __STDCPP_FLOAT32_T__ + using type = std::float32_t; +#else using type = float; +#endif }; template <> struct CppTypeForHelper { +#if __STDCPP_FLOAT64_T__ + using type = std::float64_t; +#else using type = double; +#endif }; #if LDBL_MANT_DIG == 64 template <> struct CppTypeForHelper { using type = long double; }; #endif -#if LDBL_MANT_DIG == 113 +#if __STDCPP_FLOAT128_T__ +using CppFloat128Type = std::float128_t; +#elif LDBL_MANT_DIG == 113 using CppFloat128Type = long double; #elif HAS_FLOAT128 using CppFloat128Type = __float128; #endif -#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +#if __STDCPP_FLOAT128_t || LDBL_MANT_DIG == 113 || HAS_FLOAT128 template <> struct CppTypeForHelper { using type = CppFloat128Type; }; diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index 6e1979790e3c6..84a5a7cd7a361 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -391,6 +391,21 @@ CppTypeFor RTDECL(SelectedRealKindMasked)( const char *, int, void *, int, void *, int, void *, int, int); // SPACING +// The variants Spacing2By4 and Spacing3By4 compute SPACING for REAL(2/3) +// but accept and return REAL(4) values, for use in environments where +// std::float16_t or std::bfloat16_t are unavailable. 
+#if HAS_FP16 +CppTypeFor RTDECL(Spacing2)( + CppTypeFor); +#endif +CppTypeFor RTDECL(Spacing2By4)( + CppTypeFor); +#if HAS_BF16 +CppTypeFor RTDECL(Spacing3)( + CppTypeFor); +#endif +CppTypeFor RTDECL(Spacing3By4)( + CppTypeFor); CppTypeFor RTDECL(Spacing4)( CppTypeFor); CppTypeFor RTDECL(Spacing8)( diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h index 1b5395df94519..1b43498a6bfd1 100644 --- a/flang/runtime/numeric-templates.h +++ b/flang/runtime/numeric-templates.h @@ -343,10 +343,15 @@ template inline RT_API_ATTRS T Spacing(T x) { return x; // NaN -> same NaN } else if (ISINFTy::compute(x)) { return QNANTy::compute(); // +/-Inf -> NaN - } else if (x == 0) { + } else if (x == 0) { // 0 -> TINY(x) // The standard-mandated behavior seems broken, since TINY() can't be // subnormal. - return MINTy::compute(); // 0 -> TINY(x) + if constexpr (PREC == 11) { // REAL(2) + return 0.00006103515625E-04; // TINY(0._2) + } else { + // N.B. TINY(0._3) == TINY(0._4) so this works even if no std::bfloat16_t. + return MINTy::compute(); + } } else { T result{LDEXPTy::compute( static_cast(1.0), ILOGBTy::compute(x) + 1 - PREC)}; // 2**(e-p) diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index b5e0851a16cd1..9a8ddc6615564 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -848,6 +848,26 @@ CppTypeFor RTDEF(SelectedRealKindMasked)( return SelectedRealKind(p, r, d, mask); } +#if HAS_FP16 +CppTypeFor RTDEF(Spacing2)( + CppTypeFor x) { + return Spacing<11>(x); +} +#endif +CppTypeFor RTDEF(Spacing2By4)( + CppTypeFor x) { + return Spacing<11>(x); +} +#if HAS_BF16 +CppTypeFor RTDEF(Spacing3)( + CppTypeFor x) { + return Spacing<8>(x); +} +#endif +CppTypeFor RTDEF(Spacing3By4)( + CppTypeFor x) { + return Spacing<8>(x); +} CppTypeFor RTDEF(Spacing4)( CppTypeFor x) { return Spacing<24>(x); diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 9f77e16570783..799756aab3839 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -259,6 +259,11 @@ TEST(Numeric, Spacing) { std::isnan(RTNAME(Spacing4)(std::numeric_limits>::infinity()))); EXPECT_TRUE( std::isnan(RTNAME(Spacing8)(std::numeric_limits>::quiet_NaN()))); + EXPECT_EQ(RTNAME(Spacing2By4)(Real<4>{3.0}), std::ldexp(Real<4>{1.0}, -9)); + EXPECT_EQ(RTNAME(Spacing2By4)(Real<4>{0.0}), Real<4>{0.00006103515625E-04}); + EXPECT_EQ(RTNAME(Spacing3By4)(Real<4>{3.0}), std::ldexp(Real<4>{1.0}, -6)); + EXPECT_EQ( + RTNAME(Spacing3By4)(Real<4>{0.0}), std::numeric_limits>::min()); } TEST(Numeric, FPowI) { From 143f3fc40279cbdafce190c5516c9dd74fc22ae5 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:54:00 -0700 Subject: [PATCH 133/425] [flang] Accept a non-breaking space character in source (#106611) Accept non-breaking space characters (Latin-1 '\xa0', UTF-8 '\xc2' '\xa0') in source code, converting them into regular spaces in the cooked character stream when not in character literals. 
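As a rough illustration of the detection logic that the prescanner diff below introduces, here is a small standalone sketch (the helper name and the driver are illustrative only, not flang code): classify a byte position as white space and report how many bytes to consume — one for an ASCII space or a Latin-1 NBSP, two for the UTF-8 NBSP sequence — then emit an ordinary space into the cooked output. The real prescanner also leaves character literals untouched, which this toy driver does not model.

```cpp
// Standalone sketch of NBSP handling; BytesOfSpace and main() are hypothetical.
#include <cstdio>

static int BytesOfSpace(const char *p) {
  if (*p == ' ' || *p == '\xa0') { // ASCII space or Latin-1 NBSP
    return 1;
  }
  if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP (U+00A0)
    return 2;
  }
  return 0;
}

int main() {
  const char line[] = "x\xc2\xa0= 1."; // 'x', UTF-8 NBSP, '=', ASCII space, "1."
  for (const char *p = line; *p != '\0';) {
    if (int n = BytesOfSpace(p)) {
      std::putchar(' '); // the cooked stream sees an ordinary space
      p += n;
    } else {
      std::putchar(*p);
      ++p;
    }
  }
  std::putchar('\n'); // prints: x = 1.
}
```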
--- flang/lib/Parser/prescan.cpp | 74 +++++++++++++++--------- flang/test/Parser/non-breaking-space.f90 | 6 ++ 2 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 flang/test/Parser/non-breaking-space.f90 diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 804ada7d11e02..a0cd0ff263f92 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -46,6 +46,23 @@ Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective) compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_}, compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {} +// Returns number of bytes to skip +static inline int IsSpace(const char *p) { + if (*p == ' ') { + return 1; + } else if (*p == '\xa0') { // LATIN-1 NBSP non-breaking space + return 1; + } else if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP + return 2; + } else { + return 0; + } +} + +static inline int IsSpaceOrTab(const char *p) { + return *p == '\t' ? 1 : IsSpace(p); +} + static inline constexpr bool IsFixedFormCommentChar(char ch) { return ch == '!' || ch == '*' || ch == 'C' || ch == 'c'; } @@ -126,8 +143,8 @@ void Prescanner::Statement() { if (inFixedForm_) { CHECK(IsFixedFormCommentChar(*at_)); } else { - while (*at_ == ' ' || *at_ == '\t') { - ++at_, ++column_; + while (int n{IsSpaceOrTab(at_)}) { + at_ += n, ++column_; } CHECK(*at_ == '!'); } @@ -159,10 +176,10 @@ void Prescanner::Statement() { ++sp, ++at_, ++column_) { EmitChar(tokens, *sp); } - if (*at_ == ' ' || *at_ == '\t') { + if (IsSpaceOrTab(at_)) { EmitChar(tokens, ' '); - while (*at_ == ' ' || *at_ == '\t') { - ++at_, ++column_; + while (int n{IsSpaceOrTab(at_)}) { + at_ += n, ++column_; } } tokens.CloseToken(); @@ -361,7 +378,7 @@ void Prescanner::LabelField(TokenSequence &token) { column_ = 7; break; } - if (*at_ != ' ' && + if (int n{IsSpace(at_)}; n == 0 && !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space EmitChar(token, *at_); ++outCol; @@ -493,7 +510,9 @@ bool Prescanner::MustSkipToEndOfLine() const { void Prescanner::NextChar() { CHECK(*at_ != '\n'); - ++at_, ++column_; + int n{IsSpace(at_)}; + at_ += n ? n : 1; + ++column_; while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') { // UTF-8 byte order mark - treat this file as UTF-8 at_ += 3; @@ -556,23 +575,23 @@ void Prescanner::SkipCComments() { } void Prescanner::SkipSpaces() { - while (*at_ == ' ' || *at_ == '\t') { + while (IsSpaceOrTab(at_)) { NextChar(); } insertASpace_ = false; } const char *Prescanner::SkipWhiteSpace(const char *p) { - while (*p == ' ' || *p == '\t') { - ++p; + while (int n{IsSpaceOrTab(p)}) { + p += n; } return p; } const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const { while (true) { - if (*p == ' ' || *p == '\t') { - ++p; + if (int n{IsSpaceOrTab(p)}) { + p += n; } else if (IsCComment(p)) { if (const char *after{SkipCComment(p)}) { p = after; @@ -613,7 +632,7 @@ bool Prescanner::NextToken(TokenSequence &tokens) { } SkipCComments(); } - if (*at_ == ' ' || *at_ == '\t') { + if (IsSpaceOrTab(at_)) { // Compress free-form white space into a single space character. const auto theSpace{at_}; char previous{at_ <= start_ ? 
' ' : at_[-1]}; @@ -976,8 +995,8 @@ bool Prescanner::IsFixedFormCommentLine(const char *start) const { } bool anyTabs{false}; while (true) { - if (*p == ' ') { - ++p; + if (int n{IsSpace(p)}) { + p += n; } else if (*p == '\t') { anyTabs = true; ++p; @@ -1089,7 +1108,8 @@ void Prescanner::FortranInclude(const char *firstQuote) { const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const { const char *p{start}; - for (; *p == ' '; ++p) { + while (int n{IsSpace(p)}) { + p += n; } if (*p == '#') { if (inFixedForm_ && p == start + 5) { @@ -1178,9 +1198,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { return nullptr; } } - char col6{nextLine_[5]}; - if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { - if (nextLine_[6] != ' ' && mightNeedSpace) { + const char *col6{nextLine_ + 5}; + if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { + if (mightNeedSpace && !IsSpace(nextLine_ + 6)) { insertASpace_ = true; } return nextLine_ + 6; @@ -1207,9 +1227,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { features_.IsEnabled(LanguageFeature::OldDebugLines))) && nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' && nextLine_[4] == ' ') { - char col6{nextLine_[5]}; - if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { - if ((col6 == 'i' || col6 == 'I') && IsIncludeLine(nextLine_)) { + const char *col6{nextLine_ + 5}; + if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { + if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) { // It's An INCLUDE line, not a continuation } else { return nextLine_ + 6; @@ -1356,7 +1376,7 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { char sentinel[5], *sp{sentinel}; int column{2}; for (; column < 6; ++column, ++p) { - if (*p == ' ' || *p == '\n' || *p == '\t') { + if (*p == '\n' || IsSpaceOrTab(p)) { break; } if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) { @@ -1366,8 +1386,10 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { *sp++ = ToLowerCaseLetter(*p); } if (column == 6) { - if (*p == ' ' || *p == '\t' || *p == '0') { + if (*p == '0') { ++p; + } else if (int n{IsSpaceOrTab(p)}) { + p += n; } else { // This is a Continuation line, not an initial directive line. return std::nullopt; @@ -1442,10 +1464,10 @@ std::optional> Prescanner::IsCompilerDirectiveSentinel(const char *p) const { char sentinel[8]; for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) { - if (*p == ' ' || *p == '\t' || *p == '&') { + if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) { if (j > 0) { sentinel[j] = '\0'; - p = SkipWhiteSpace(p + 1); + p = SkipWhiteSpace(p + n); if (*p != '!') { if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) { return std::make_pair(sp, p); diff --git a/flang/test/Parser/non-breaking-space.f90 b/flang/test/Parser/non-breaking-space.f90 new file mode 100644 index 0000000000000..f807d4b637f63 --- /dev/null +++ b/flang/test/Parser/non-breaking-space.f90 @@ -0,0 +1,6 @@ +! RUN: %flang_fc1 -fsyntax-only %s +! This line contains the Latin-1 NBSP (non-breaking space) character '\xa0' +x= 1. +! This line contains the UTF-8 encoding of NBSP ('\xc2' '\xa0') +x= 1. 
+end From 840da2e8ba7e0f77938adfc6f6d315137542a1b8 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:54:17 +0000 Subject: [PATCH 134/425] [SandboxIR] Implement CmpInst, FCmpInst, and ICmpInst (#106301) As in the description. Not sure the macros for "WRAP_XXX" add value or not, but do save some boiler plate. Maybe there is a better way. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 175 ++++++++++++++++- .../llvm/SandboxIR/SandboxIRValues.def | 2 + llvm/include/llvm/SandboxIR/Tracker.h | 15 +- llvm/include/llvm/SandboxIR/Type.h | 2 + llvm/lib/SandboxIR/SandboxIR.cpp | 83 ++++++++ llvm/lib/SandboxIR/Tracker.cpp | 8 + llvm/unittests/SandboxIR/SandboxIRTest.cpp | 178 ++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 43 +++++ 8 files changed, 500 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 5c2d58c1b99dc..89e963498426d 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -54,11 +54,17 @@ // | | // | +- ZExtInst // | -// +- CallBase -----------+- CallBrInst -// | | -// +- CmpInst +- CallInst -// | | -// +- ExtractElementInst +- InvokeInst +// +- CallBase --------+- CallBrInst +// | | +// | +- CallInst +// | | +// | +- InvokeInst +// | +// +- CmpInst ---------+- ICmpInst +// | | +// | +- FCmpInst +// | +// +- ExtractElementInst // | // +- GetElementPtrInst // | @@ -158,6 +164,9 @@ class BinaryOperator; class PossiblyDisjointInst; class AtomicRMWInst; class AtomicCmpXchgInst; +class CmpInst; +class ICmpInst; +class FCmpInst; /// Iterator for the `Use` edges of a User's operands. /// \Returns the operand `Use` when dereferenced. @@ -304,6 +313,7 @@ class Value { friend class PHINode; // For getting `Val`. friend class UnreachableInst; // For getting `Val`. friend class CatchSwitchAddHandler; // For `Val`. + friend class CmpInst; // For getting `Val`. friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. @@ -1076,6 +1086,7 @@ class Instruction : public sandboxir::User { friend class CastInst; // For getTopmostLLVMInstruction(). friend class PHINode; // For getTopmostLLVMInstruction(). friend class UnreachableInst; // For getTopmostLLVMInstruction(). + friend class CmpInst; // For getTopmostLLVMInstruction(). /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program /// order. 
@@ -1232,6 +1243,7 @@ template class SingleLLVMInstructionImpl : public Instruction { friend class UnaryInstruction; friend class CallBase; friend class FuncletPadInst; + friend class CmpInst; Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final { return getOperandUseDefault(OpIdx, Verify); @@ -3425,6 +3437,151 @@ class PHINode final : public SingleLLVMInstructionImpl { // uint32_t ToIdx = 0) }; +// Wraps a static function that takes a single Predicate parameter +// LLVMValType should be the type of the wrapped class +#define WRAP_STATIC_PREDICATE(FunctionName) \ + static auto FunctionName(Predicate P) { return LLVMValType::FunctionName(P); } +// Wraps a member function that takes no parameters +// LLVMValType should be the type of the wrapped class +#define WRAP_MEMBER(FunctionName) \ + auto FunctionName() const { return cast(Val)->FunctionName(); } +// Wraps both--a common idiom in the CmpInst classes +#define WRAP_BOTH(FunctionName) \ + WRAP_STATIC_PREDICATE(FunctionName) \ + WRAP_MEMBER(FunctionName) + +class CmpInst : public SingleLLVMInstructionImpl { +protected: + using LLVMValType = llvm::CmpInst; + /// Use Context::createCmpInst(). Don't call the constructor directly. + CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc) + : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {} + friend Context; // for CmpInst() + static Value *createCommon(Value *Cond, Value *True, Value *False, + const Twine &Name, IRBuilder<> &Builder, + Context &Ctx); + +public: + using Predicate = llvm::CmpInst::Predicate; + + static CmpInst *create(Predicate Pred, Value *S1, Value *S2, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + static CmpInst *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, + const Instruction *FlagsSource, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + void setPredicate(Predicate P); + void swapOperands(); + + WRAP_MEMBER(getPredicate); + WRAP_BOTH(isFPPredicate); + WRAP_BOTH(isIntPredicate); + WRAP_STATIC_PREDICATE(getPredicateName); + WRAP_BOTH(getInversePredicate); + WRAP_BOTH(getOrderedPredicate); + WRAP_BOTH(getUnorderedPredicate); + WRAP_BOTH(getSwappedPredicate); + WRAP_BOTH(isStrictPredicate); + WRAP_BOTH(isNonStrictPredicate); + WRAP_BOTH(getStrictPredicate); + WRAP_BOTH(getNonStrictPredicate); + WRAP_BOTH(getFlippedStrictnessPredicate); + WRAP_MEMBER(isCommutative); + WRAP_BOTH(isEquality); + WRAP_BOTH(isRelational); + WRAP_BOTH(isSigned); + WRAP_BOTH(getSignedPredicate); + WRAP_BOTH(getUnsignedPredicate); + WRAP_BOTH(getFlippedSignednessPredicate); + WRAP_BOTH(isTrueWhenEqual); + WRAP_BOTH(isFalseWhenEqual); + WRAP_BOTH(isUnsigned); + WRAP_STATIC_PREDICATE(isOrdered); + WRAP_STATIC_PREDICATE(isUnordered); + + static bool isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) { + return llvm::CmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); + } + static bool isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) { + return llvm::CmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); + } + + /// Method for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ICmp || + From->getSubclassID() == ClassID::FCmp; + } + + /// Create a result type for fcmp/icmp + static Type *makeCmpResultType(Type *OpndType); + +#ifndef NDEBUG + void dumpOS(raw_ostream &OS) const override; + LLVM_DUMP_METHOD void dump() const; +#endif +}; + +class ICmpInst : public CmpInst { + /// Use Context::createICmpInst(). 
Don't call the constructor directly. + ICmpInst(llvm::ICmpInst *CI, Context &Ctx) + : CmpInst(CI, Ctx, ClassID::ICmp, Opcode::ICmp) {} + friend class Context; // For constructor. + using LLVMValType = llvm::ICmpInst; + +public: + void swapOperands(); + + WRAP_BOTH(getSignedPredicate); + WRAP_BOTH(getUnsignedPredicate); + WRAP_BOTH(isEquality); + WRAP_MEMBER(isCommutative); + WRAP_MEMBER(isRelational); + WRAP_STATIC_PREDICATE(isGT); + WRAP_STATIC_PREDICATE(isLT); + WRAP_STATIC_PREDICATE(isGE); + WRAP_STATIC_PREDICATE(isLE); + + static auto predicates() { return llvm::ICmpInst::predicates(); } + static bool compare(const APInt &LHS, const APInt &RHS, + ICmpInst::Predicate Pred) { + return llvm::ICmpInst::compare(LHS, RHS, Pred); + } + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ICmp; + } +}; + +class FCmpInst : public CmpInst { + /// Use Context::createFCmpInst(). Don't call the constructor directly. + FCmpInst(llvm::FCmpInst *CI, Context &Ctx) + : CmpInst(CI, Ctx, ClassID::FCmp, Opcode::FCmp) {} + friend class Context; // For constructor. + using LLVMValType = llvm::FCmpInst; + +public: + void swapOperands(); + + WRAP_BOTH(isEquality); + WRAP_MEMBER(isCommutative); + WRAP_MEMBER(isRelational); + + static auto predicates() { return llvm::FCmpInst::predicates(); } + static bool compare(const APFloat &LHS, const APFloat &RHS, + FCmpInst::Predicate Pred) { + return llvm::FCmpInst::compare(LHS, RHS, Pred); + } + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::FCmp; + } +}; + +#undef WRAP_STATIC_PREDICATE +#undef WRAP_MEMBER +#undef WRAP_BOTH + /// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to /// an OpaqueInstr. class OpaqueInst : public SingleLLVMInstructionImpl { @@ -3445,6 +3602,8 @@ class Context { LLVMContext &LLVMCtx; friend class Type; // For LLVMCtx. friend class PointerType; // For LLVMCtx. + friend class CmpInst; // For LLVMCtx. TODO: cleanup when sandboxir::VectorType + // is complete friend class IntegerType; // For LLVMCtx. friend class StructType; // For LLVMCtx. Tracker IRTracker; @@ -3572,6 +3731,12 @@ class Context { friend PHINode; // For createPHINode() UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); friend UnreachableInst; // For createUnreachableInst() + CmpInst *createCmpInst(llvm::CmpInst *I); + friend CmpInst; // For createCmpInst() + ICmpInst *createICmpInst(llvm::ICmpInst *I); + friend ICmpInst; // For createICmpInst() + FCmpInst *createFCmpInst(llvm::FCmpInst *I); + friend FCmpInst; // For createFCmpInst() public: Context(LLVMContext &LLVMCtx) diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index d2031bbdcfb54..f320f61934efa 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -113,6 +113,8 @@ DEF_INSTR(Cast, OPCODES(\ ), CastInst) DEF_INSTR(PHI, OP(PHI), PHINode) DEF_INSTR(Unreachable, OP(Unreachable), UnreachableInst) +DEF_INSTR(ICmp, OP(ICmp), FCmpInst) +DEF_INSTR(FCmp, OP(FCmp), ICmpInst) // clang-format on #ifdef DEF_VALUE diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index c8a9e99a34341..5fc43db82bd70 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -63,7 +63,7 @@ class CatchSwitchInst; class SwitchInst; class ConstantInt; class ShuffleVectorInst; - +class CmpInst; /// The base class for IR Change classes. 
class IRChangeBase { protected: @@ -130,6 +130,19 @@ class PHIAddIncoming : public IRChangeBase { #endif }; +class CmpSwapOperands : public IRChangeBase { + CmpInst *Cmp; + +public: + CmpSwapOperands(CmpInst *Cmp); + void revert(Tracker &Tracker) final; + void accept() final {} +#ifndef NDEBUG + void dump(raw_ostream &OS) const final { OS << "CmpSwapOperands"; } + LLVM_DUMP_METHOD void dump() const final; +#endif +}; + /// Tracks swapping a Use with another Use. class UseSwap : public IRChangeBase { Use ThisUse; diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 2f9b94b8d7175..61721ca836321 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -50,6 +50,8 @@ class Type { friend class ConstantArray; // For LLVMTy. friend class ConstantStruct; // For LLVMTy. friend class ConstantVector; // For LLVMTy. + friend class CmpInst; // For LLVMTy. TODO: Cleanup after sandboxir::VectorType + // is more complete. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index e8d081e6b17e7..c0e5837209213 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2735,6 +2735,16 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); return It->second.get(); } + case llvm::Instruction::ICmp: { + auto *LLVMICmp = cast(LLVMV); + It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); + return It->second.get(); + } + case llvm::Instruction::FCmp: { + auto *LLVMFCmp = cast(LLVMV); + It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); + return It->second.get(); + } case llvm::Instruction::Unreachable: { auto *LLVMUnreachable = cast(LLVMV); It->second = std::unique_ptr( @@ -2922,6 +2932,79 @@ PHINode *Context::createPHINode(llvm::PHINode *I) { auto NewPtr = std::unique_ptr(new PHINode(I, *this)); return cast(registerValue(std::move(NewPtr))); } +ICmpInst *Context::createICmpInst(llvm::ICmpInst *I) { + auto NewPtr = std::unique_ptr(new ICmpInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} +FCmpInst *Context::createFCmpInst(llvm::FCmpInst *I) { + auto NewPtr = std::unique_ptr(new FCmpInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} +CmpInst *CmpInst::create(Predicate P, Value *S1, Value *S2, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction()); + auto *LLVMI = Builder.CreateCmp(P, S1->Val, S2->Val, Name); + if (dyn_cast(LLVMI)) + return Ctx.createICmpInst(cast(LLVMI)); + return Ctx.createFCmpInst(cast(LLVMI)); +} +CmpInst *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value *S2, + const Instruction *F, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + CmpInst *Inst = create(P, S1, S2, InsertBefore, Ctx, Name); + cast(Inst->Val)->copyIRFlags(F->Val); + return Inst; +} + +Type *CmpInst::makeCmpResultType(Type *OpndType) { + if (auto *VT = dyn_cast(OpndType)) { + // TODO: Cleanup when we have more complete support for + // sandboxir::VectorType + return OpndType->getContext().getType(llvm::VectorType::get( + llvm::Type::getInt1Ty(OpndType->getContext().LLVMCtx), + cast(VT->LLVMTy)->getElementCount())); + } + return Type::getInt1Ty(OpndType->getContext()); +} + +void 
CmpInst::setPredicate(Predicate P) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&CmpInst::getPredicate, &CmpInst::setPredicate>>(this); + cast(Val)->setPredicate(P); +} + +void CmpInst::swapOperands() { + if (ICmpInst *IC = dyn_cast(this)) + IC->swapOperands(); + else + cast(this)->swapOperands(); +} + +void ICmpInst::swapOperands() { + Ctx.getTracker().emplaceIfTracking(this); + cast(Val)->swapOperands(); +} + +void FCmpInst::swapOperands() { + Ctx.getTracker().emplaceIfTracking(this); + cast(Val)->swapOperands(); +} + +#ifndef NDEBUG +void CmpInst::dumpOS(raw_ostream &OS) const { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); +} + +void CmpInst::dump() const { + dumpOS(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG Value *Context::getValue(llvm::Value *V) const { auto It = LLVMValueToValueMap.find(V); diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 953d4bd51353a..c6eb9fc68a4b1 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -248,6 +248,14 @@ void ShuffleVectorSetMask::dump() const { } #endif +CmpSwapOperands::CmpSwapOperands(CmpInst *Cmp) : Cmp(Cmp) {} + +void CmpSwapOperands::revert(Tracker &Tracker) { Cmp->swapOperands(); } +void CmpSwapOperands::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} + void Tracker::save() { State = TrackerState::Record; } void Tracker::revert() { diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index ca2a183e53268..d1c5690ccad5b 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -4800,6 +4800,184 @@ define void @foo(i32 %arg) { EXPECT_EQ(NewPHI->getNumIncomingValues(), PHI->getNumIncomingValues()); } +static void checkSwapOperands(sandboxir::Context &Ctx, + llvm::sandboxir::CmpInst *Cmp, + llvm::CmpInst *LLVMCmp) { + auto OrigOp0 = Cmp->getOperand(0); + auto OrigOp1 = Cmp->getOperand(1); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(0)), OrigOp0); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(1)), OrigOp1); + // This checks the dispatch mechanism in CmpInst, as well as + // the specific implementations. 
+ Cmp->swapOperands(); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(1)), OrigOp0); + EXPECT_EQ(Ctx.getValue(LLVMCmp->getOperand(0)), OrigOp1); + EXPECT_EQ(Cmp->getOperand(0), OrigOp1); + EXPECT_EQ(Cmp->getOperand(1), OrigOp0); + // Undo it to keep the rest of the test consistent + Cmp->swapOperands(); +} + +static void checkCommonPredicates(sandboxir::CmpInst *Cmp, + llvm::CmpInst *LLVMCmp) { + // Check proper creation + auto Pred = Cmp->getPredicate(); + auto LLVMPred = LLVMCmp->getPredicate(); + EXPECT_EQ(Pred, LLVMPred); + // Check setPredicate + Cmp->setPredicate(llvm::CmpInst::FCMP_FALSE); + EXPECT_EQ(Cmp->getPredicate(), llvm::CmpInst::FCMP_FALSE); + EXPECT_EQ(LLVMCmp->getPredicate(), llvm::CmpInst::FCMP_FALSE); + Cmp->setPredicate(Pred); + EXPECT_EQ(LLVMCmp->getPredicate(), Pred); + // Ensure the accessors properly forward to the underlying implementation + EXPECT_STREQ(sandboxir::CmpInst::getPredicateName(Pred).data(), + llvm::CmpInst::getPredicateName(LLVMPred).data()); + EXPECT_EQ(Cmp->isFPPredicate(), LLVMCmp->isFPPredicate()); + EXPECT_EQ(Cmp->isIntPredicate(), LLVMCmp->isIntPredicate()); + EXPECT_EQ(Cmp->getInversePredicate(), LLVMCmp->getInversePredicate()); + EXPECT_EQ(Cmp->getOrderedPredicate(), LLVMCmp->getOrderedPredicate()); + EXPECT_EQ(Cmp->getUnorderedPredicate(), LLVMCmp->getUnorderedPredicate()); + EXPECT_EQ(Cmp->getSwappedPredicate(), LLVMCmp->getSwappedPredicate()); + EXPECT_EQ(Cmp->isStrictPredicate(), LLVMCmp->isStrictPredicate()); + EXPECT_EQ(Cmp->isNonStrictPredicate(), LLVMCmp->isNonStrictPredicate()); + EXPECT_EQ(Cmp->isRelational(), LLVMCmp->isRelational()); + if (Cmp->isRelational()) { + EXPECT_EQ(Cmp->getFlippedStrictnessPredicate(), + LLVMCmp->getFlippedStrictnessPredicate()); + } + EXPECT_EQ(Cmp->isCommutative(), LLVMCmp->isCommutative()); + EXPECT_EQ(Cmp->isTrueWhenEqual(), LLVMCmp->isTrueWhenEqual()); + EXPECT_EQ(Cmp->isFalseWhenEqual(), LLVMCmp->isFalseWhenEqual()); + EXPECT_EQ(sandboxir::CmpInst::isOrdered(Pred), + llvm::CmpInst::isOrdered(LLVMPred)); + EXPECT_EQ(sandboxir::CmpInst::isUnordered(Pred), + llvm::CmpInst::isUnordered(LLVMPred)); +} + +TEST_F(SandboxIRTest, ICmpInst) { + SCOPED_TRACE("SandboxIRTest sandboxir::ICmpInst tests"); + parseIR(C, R"IR( +define void @foo(i32 %i0, i32 %i1) { + bb: + %ine = icmp ne i32 %i0, %i1 + %iugt = icmp ugt i32 %i0, %i1 + %iuge = icmp uge i32 %i0, %i1 + %iult = icmp ult i32 %i0, %i1 + %iule = icmp ule i32 %i0, %i1 + %isgt = icmp sgt i32 %i0, %i1 + %isle = icmp sle i32 %i0, %i1 + %ieg = icmp eq i32 %i0, %i1 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + + auto *LLVMBB = getBasicBlockByName(LLVMF, "bb"); + auto LLVMIt = LLVMBB->begin(); + auto *BB = cast(Ctx.getValue(LLVMBB)); + auto It = BB->begin(); + // Check classof() + while (auto *ICmp = dyn_cast(&*It++)) { + auto *LLVMICmp = cast(&*LLVMIt++); + checkSwapOperands(Ctx, ICmp, LLVMICmp); + checkCommonPredicates(ICmp, LLVMICmp); + EXPECT_EQ(ICmp->isSigned(), LLVMICmp->isSigned()); + EXPECT_EQ(ICmp->isUnsigned(), LLVMICmp->isUnsigned()); + EXPECT_EQ(ICmp->getSignedPredicate(), LLVMICmp->getSignedPredicate()); + EXPECT_EQ(ICmp->getUnsignedPredicate(), LLVMICmp->getUnsignedPredicate()); + } + auto *NewCmp = + sandboxir::CmpInst::create(llvm::CmpInst::ICMP_ULE, F.getArg(0), + F.getArg(1), &*BB->begin(), Ctx, "NewCmp"); + EXPECT_EQ(NewCmp, &*BB->begin()); + EXPECT_EQ(NewCmp->getPredicate(), llvm::CmpInst::ICMP_ULE); + EXPECT_EQ(NewCmp->getOperand(0), 
F.getArg(0)); + EXPECT_EQ(NewCmp->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewCmp->getName(), "NewCmp"); +#endif // NDEBUG + // TODO: Improve this test when sandboxir::VectorType is more completely + // implemented. + sandboxir::Type *RT = + sandboxir::CmpInst::makeCmpResultType(F.getArg(0)->getType()); + EXPECT_TRUE(RT->isIntegerTy(1)); // Only one bit in a single comparison +} + +TEST_F(SandboxIRTest, FCmpInst) { + SCOPED_TRACE("SandboxIRTest sandboxir::FCmpInst tests"); + parseIR(C, R"IR( +define void @foo(float %f0, float %f1) { +bb: + %ffalse = fcmp false float %f0, %f1 + %foeq = fcmp oeq float %f0, %f1 + %fogt = fcmp ogt float %f0, %f1 + %folt = fcmp olt float %f0, %f1 + %fole = fcmp ole float %f0, %f1 + %fone = fcmp one float %f0, %f1 + %ford = fcmp ord float %f0, %f1 + %funo = fcmp uno float %f0, %f1 + %fueq = fcmp ueq float %f0, %f1 + %fugt = fcmp ugt float %f0, %f1 + %fuge = fcmp uge float %f0, %f1 + %fult = fcmp ult float %f0, %f1 + %fule = fcmp ule float %f0, %f1 + %fune = fcmp une float %f0, %f1 + %ftrue = fcmp true float %f0, %f1 + ret void +bb1: + %copyfrom = fadd reassoc float %f0, 42.0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + + auto *LLVMBB = getBasicBlockByName(LLVMF, "bb"); + auto LLVMIt = LLVMBB->begin(); + auto *BB = cast(Ctx.getValue(LLVMBB)); + auto It = BB->begin(); + // Check classof() + while (auto *FCmp = dyn_cast(&*It++)) { + auto *LLVMFCmp = cast(&*LLVMIt++); + checkSwapOperands(Ctx, FCmp, LLVMFCmp); + checkCommonPredicates(FCmp, LLVMFCmp); + } + + auto *LLVMBB1 = getBasicBlockByName(LLVMF, "bb1"); + auto *BB1 = cast(Ctx.getValue(LLVMBB1)); + auto It1 = BB1->begin(); + auto *CopyFrom = &*It1++; + CopyFrom->setFastMathFlags(FastMathFlags::getFast()); + + // create with default flags + auto *NewFCmp = sandboxir::CmpInst::create( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), &*It1, Ctx, "NewFCmp"); + EXPECT_EQ(NewFCmp->getPredicate(), llvm::CmpInst::FCMP_ONE); + EXPECT_EQ(NewFCmp->getOperand(0), F.getArg(0)); + EXPECT_EQ(NewFCmp->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewFCmp->getName(), "NewFCmp"); +#endif // NDEBUG + FastMathFlags DefaultFMF = NewFCmp->getFastMathFlags(); + EXPECT_TRUE(CopyFrom->getFastMathFlags() != DefaultFMF); + // create with copied flags + auto *NewFCmpFlags = sandboxir::CmpInst::createWithCopiedFlags( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), CopyFrom, &*It1, Ctx, + "NewFCmpFlags"); + EXPECT_FALSE(NewFCmpFlags->getFastMathFlags() != + CopyFrom->getFastMathFlags()); + EXPECT_EQ(NewFCmpFlags->getPredicate(), llvm::CmpInst::FCMP_ONE); + EXPECT_EQ(NewFCmpFlags->getOperand(0), F.getArg(0)); + EXPECT_EQ(NewFCmpFlags->getOperand(1), F.getArg(1)); +#ifndef NDEBUG + EXPECT_EQ(NewFCmpFlags->getName(), "NewFCmpFlags"); +#endif // NDEBUG +} + TEST_F(SandboxIRTest, UnreachableInst) { parseIR(C, R"IR( define void @foo() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index fe29452a8aea2..a1f39fe958e35 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1452,6 +1452,49 @@ define void @foo(i8 %arg0, i8 %arg1, i8 %arg2) { EXPECT_EQ(PHI->getIncomingValue(1), Arg1); } +void checkCmpInst(sandboxir::Context &Ctx, sandboxir::CmpInst *Cmp) { + Ctx.save(); + auto OrigP = Cmp->getPredicate(); + auto NewP = Cmp->getSwappedPredicate(); + Cmp->setPredicate(NewP); + EXPECT_EQ(Cmp->getPredicate(), NewP); 
+ Ctx.revert(); + EXPECT_EQ(Cmp->getPredicate(), OrigP); + + Ctx.save(); + auto OrigOp0 = Cmp->getOperand(0); + auto OrigOp1 = Cmp->getOperand(1); + Cmp->swapOperands(); + EXPECT_EQ(Cmp->getPredicate(), NewP); + EXPECT_EQ(Cmp->getOperand(0), OrigOp1); + EXPECT_EQ(Cmp->getOperand(1), OrigOp0); + Ctx.revert(); + EXPECT_EQ(Cmp->getPredicate(), OrigP); + EXPECT_EQ(Cmp->getOperand(0), OrigOp0); + EXPECT_EQ(Cmp->getOperand(1), OrigOp1); +} + +TEST_F(TrackerTest, CmpInst) { + SCOPED_TRACE("TrackerTest sandboxir::CmpInst tests"); + parseIR(C, R"IR( +define void @foo(i64 %i0, i64 %i1, float %f0, float %f1) { + %foeq = fcmp ogt float %f0, %f1 + %ioeq = icmp uge i64 %i0, %i1 + + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *FCmp = cast(&*It++); + checkCmpInst(Ctx, FCmp); + auto *ICmp = cast(&*It++); + checkCmpInst(Ctx, ICmp); +} + TEST_F(TrackerTest, SetVolatile) { parseIR(C, R"IR( define void @foo(ptr %arg0, i8 %val) { From d1e4a2d300f7c0c6b681ddf719132c81d348aaab Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:54:46 -0700 Subject: [PATCH 135/425] [flang] Fix spurious error with separate module procedures (#106768) When the implementation of one SMP apparently references another in what might be a specification expression, semantics may need to resolve it as a forward reference, and to allow for the replacement of a SubprogramNameDetails place-holding symbol with the final SubprogramDetails symbol. Otherwise, as in the bug report below, confusing error messages may result. (The reference in question isn't really in the specification part of a subprogram, but due to the syntactic ambiguity between the array element assignment statement and a statement function definition, it appears to be so at the time that the reference is processed.) I needed to make DumpSymbols() available via SemanticsContext to analyze this bug, and left that new API in place to make things easier next time. Fixes https://github.com/llvm/llvm-project/issues/106705. 
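To make the mechanism concrete, here is a minimal stand-alone C++ sketch of why the caller has to look the name up again in the owning scope after resolution. The Scope/Symbol/resolve names below are invented for illustration and are not flang's real semantics classes; the point is only that resolving a sibling module procedure may replace the place-holding symbol object, so a pointer captured beforehand no longer describes the subprogram.

  // Illustrative sketch only; hypothetical Scope/Symbol types, not flang's.
  #include <iostream>
  #include <map>
  #include <memory>
  #include <string>

  struct Symbol {
    std::string name;
    bool isPlaceholder; // stands in for SubprogramNameDetails vs. SubprogramDetails
  };

  struct Scope {
    std::map<std::string, std::unique_ptr<Symbol>> symbols;
    // Resolving the subprogram may *replace* the place-holder with a new object.
    void resolve(const std::string &name) {
      symbols[name] = std::make_unique<Symbol>(Symbol{name, /*isPlaceholder=*/false});
    }
    const Symbol *find(const std::string &name) const {
      auto it = symbols.find(name);
      return it == symbols.end() ? nullptr : it->second.get();
    }
  };

  int main() {
    Scope scope;
    scope.symbols["baz"] = std::make_unique<Symbol>(Symbol{"baz", true});
    std::cout << "before resolve: placeholder = "
              << scope.find("baz")->isPlaceholder << "\n";
    scope.resolve("baz"); // the object stored under "baz" is replaced
    // Re-look the name up instead of reusing a pointer captured earlier;
    // this is the pattern the revised ResolveForward follows.
    std::cout << "after resolve:  placeholder = "
              << scope.find("baz")->isPlaceholder << "\n";
    return 0;
  }
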
--- flang/include/flang/Semantics/expression.h | 2 +- flang/include/flang/Semantics/semantics.h | 2 ++ flang/lib/Semantics/expression.cpp | 41 ++++++++++++++-------- flang/lib/Semantics/semantics.cpp | 6 ++-- flang/test/Semantics/smp-proc-ref.f90 | 20 +++++++++++ 5 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 flang/test/Semantics/smp-proc-ref.f90 diff --git a/flang/include/flang/Semantics/expression.h b/flang/include/flang/Semantics/expression.h index a224b08da21da..b1304d704232d 100644 --- a/flang/include/flang/Semantics/expression.h +++ b/flang/include/flang/Semantics/expression.h @@ -354,7 +354,7 @@ class ExpressionAnalyzer { parser::CharBlock, const ProcedureDesignator &, ActualArguments &); using AdjustActuals = std::optional>; - bool ResolveForward(const Symbol &); + const Symbol *ResolveForward(const Symbol &); std::pair ResolveGeneric( const Symbol &, const ActualArguments &, const AdjustActuals &, bool isSubroutine, bool mightBeStructureConstructor = false); diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index ec8d12b0f9865..e73f9d2e85d58 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -257,6 +257,8 @@ class SemanticsContext { void NoteDefinedSymbol(const Symbol &); bool IsSymbolDefined(const Symbol &) const; + void DumpSymbols(llvm::raw_ostream &); + private: struct ScopeIndexComparator { bool operator()(parser::CharBlock, parser::CharBlock) const; diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 4f8632a2055d9..60db02dc764b4 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2650,9 +2650,9 @@ static int ComputeCudaMatchingDistance( // Handles a forward reference to a module function from what must // be a specification expression. Return false if the symbol is // an invalid forward reference. -bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { +const Symbol *ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { if (context_.HasError(symbol)) { - return false; + return nullptr; } if (const auto *details{ symbol.detailsIf()}) { @@ -2661,8 +2661,13 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { // checking a specification expression in a sibling module // procedure. Resolve its names now so that its interface // is known. 
+ const semantics::Scope &scope{symbol.owner()}; semantics::ResolveSpecificationParts(context_, symbol); - if (symbol.has()) { + const Symbol *resolved{nullptr}; + if (auto iter{scope.find(symbol.name())}; iter != scope.cend()) { + resolved = &*iter->second; + } + if (!resolved || resolved->has()) { // When the symbol hasn't had its details updated, we must have // already been in the process of resolving the function's // specification part; but recursive function calls are not @@ -2670,8 +2675,8 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { Say("The module function '%s' may not be referenced recursively in a specification expression"_err_en_US, symbol.name()); context_.SetError(symbol); - return false; } + return resolved; } else if (inStmtFunctionDefinition_) { semantics::ResolveSpecificationParts(context_, symbol); CHECK(symbol.has()); @@ -2679,10 +2684,10 @@ bool ExpressionAnalyzer::ResolveForward(const Symbol &symbol) { Say("The internal function '%s' may not be referenced in a specification expression"_err_en_US, symbol.name()); context_.SetError(symbol); - return false; + return nullptr; } } - return true; + return &symbol; } // Resolve a call to a generic procedure with given actual arguments. @@ -2709,20 +2714,21 @@ std::pair ExpressionAnalyzer::ResolveGeneric( } if (const auto *details{ultimate.detailsIf()}) { for (const Symbol &specific0 : details->specificProcs()) { - const Symbol &specific{BypassGeneric(specific0)}; - if (isSubroutine != !IsFunction(specific)) { + const Symbol &specific1{BypassGeneric(specific0)}; + if (isSubroutine != !IsFunction(specific1)) { continue; } - if (!ResolveForward(specific)) { + const Symbol *specific{ResolveForward(specific1)}; + if (!specific) { continue; } if (std::optional procedure{ characteristics::Procedure::Characterize( - ProcedureDesignator{specific}, context_.foldingContext(), + ProcedureDesignator{*specific}, context_.foldingContext(), /*emitError=*/false)}) { ActualArguments localActuals{actuals}; - if (specific.has()) { - if (!adjustActuals.value()(specific, localActuals)) { + if (specific->has()) { + if (!adjustActuals.value()(*specific, localActuals)) { continue; } } @@ -2751,9 +2757,9 @@ std::pair ExpressionAnalyzer::ResolveGeneric( } if (!procedure->IsElemental()) { // takes priority over elemental match - nonElemental = &specific; + nonElemental = specific; } else { - elemental = &specific; + elemental = specific; } crtMatchingDistance = ComputeCudaMatchingDistance( context_.languageFeatures(), *procedure, localActuals); @@ -2866,7 +2872,12 @@ auto ExpressionAnalyzer::GetCalleeAndArguments(const parser::Name &name, if (context_.HasError(symbol)) { return std::nullopt; // also handles null symbol } - const Symbol &ultimate{DEREF(symbol).GetUltimate()}; + symbol = ResolveForward(*symbol); + if (!symbol) { + return std::nullopt; + } + name.symbol = const_cast(symbol); + const Symbol &ultimate{symbol->GetUltimate()}; CheckForBadRecursion(name.source, ultimate); bool dueToAmbiguity{false}; bool isGenericInterface{ultimate.has()}; diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index f7a277d1b414f..8592d1e5d6217 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -655,10 +655,12 @@ void Semantics::EmitMessages(llvm::raw_ostream &os) { context_.messages().Emit(os, context_.allCookedSources()); } -void Semantics::DumpSymbols(llvm::raw_ostream &os) { - DoDumpSymbols(os, context_.globalScope()); +void 
SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { + DoDumpSymbols(os, globalScope()); } +void Semantics::DumpSymbols(llvm::raw_ostream &os) { context_.DumpSymbols(os); } + void Semantics::DumpSymbolsSources(llvm::raw_ostream &os) const { NameToSymbolMap symbols; GetSymbolNames(context_.globalScope(), symbols); diff --git a/flang/test/Semantics/smp-proc-ref.f90 b/flang/test/Semantics/smp-proc-ref.f90 new file mode 100644 index 0000000000000..9a2fae442e8e7 --- /dev/null +++ b/flang/test/Semantics/smp-proc-ref.f90 @@ -0,0 +1,20 @@ +!RUN: %flang_fc1 -fsyntax-only %s +module m + real :: qux(10) + interface + module subroutine bar(i) + end + module function baz() + end + end interface +end + +submodule(m) sm + contains + module procedure bar + qux(i) = baz() ! ensure no bogus error here + end + module procedure baz + baz = 1. + end +end From 1324789a65665c27eda9e04bc93db81cc859924c Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:55:21 -0700 Subject: [PATCH 136/425] [flang][preprocessor] Extend handling of line continuation replacements (#107010) Codes using traditional C preprocessors will sometimes put a keyword macro name in a free form continuation line in order to get macro replacement of part of an identifier, as in call subr_& &N& &(1.) where N is a keyword macro. f18 already handles this case, but not when there is white space between the macro name and the following continuation marker character '&'. Allow white space to appear. Fixes https://github.com/llvm/llvm-project/issues/106931. --- flang/lib/Parser/prescan.cpp | 12 ++++++++---- flang/test/Preprocessing/pp134.F90 | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index a0cd0ff263f92..7dcb61ac79109 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -731,12 +731,16 @@ bool Prescanner::NextToken(TokenSequence &tokens) { // Subtlety: When an identifier is split across three or more continuation // lines (or two continuation lines, immediately preceded or followed // by '&' free form continuation line markers, its parts are kept as - // distinct pp-tokens so that macro operates on them independently. - // This trick accommodates the historic practice of using line - // continuation for token pasting after replacement. + // distinct pp-tokens so that macro replacement operates on them + // independently. This trick accommodates the historic practice of + // using line continuation for token pasting after replacement. } else if (parts == 2) { + if (afterLast && afterLast < limit_) { + afterLast = SkipWhiteSpace(afterLast); + } if ((start > start_ && start[-1] == '&') || - (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) { + (afterLast && afterLast < limit_ && + (*afterLast == '&' || *afterLast == '\n'))) { // call & call foo& call foo& // &MACRO& OR &MACRO& OR &MACRO // &foo(...) &(...) diff --git a/flang/test/Preprocessing/pp134.F90 b/flang/test/Preprocessing/pp134.F90 index bc34767224fa0..213baad900b60 100644 --- a/flang/test/Preprocessing/pp134.F90 +++ b/flang/test/Preprocessing/pp134.F90 @@ -1,7 +1,8 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s ! CHECK: print *, ADC, 1 -! CHECK: print *, AD, 1 -! CHECK: print *, DC, 1 +! CHECK: print *, AD, 2 +! CHECK: print *, DC, 3 +! CHECK: print *, AD(1), 4 ! CHECK: print *, AD ! 
CHECK: print *, AB #define B D @@ -12,10 +13,13 @@ &C, 1 print *, A& &B& - &, 1 + &, 2 print *, & &B& - &C, 1 + &C, 3 +print *, A& + &B & + &(1), 4 print *, A& &B print *, A& From 61759513c8166a6420ded480802de72859a45499 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 4 Sep 2024 11:08:05 -0700 Subject: [PATCH 137/425] [Analysis] Update getPromotionCandidatesForInstruction description (NFC) (#107277) Updates the description for getPromotionCandidatesForInstruction to reflect the cleanup done in #95624. --- llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h index b9cf048a71043..b6bdfb1275c92 100644 --- a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h +++ b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h @@ -49,9 +49,8 @@ class ICallPromotionAnalysis { /// Returns reference to array of InstrProfValueData for the given /// instruction \p I. /// - /// The \p NumVals, \p TotalCount and \p NumCandidates - /// are set to the number of values in the array, the total profile count - /// of the indirect call \p I, and the number of profitable candidates + /// The \p TotalCount and \p NumCandidates are set to the the total profile + /// count of the indirect call \p I and the number of profitable candidates /// in the given array (which is sorted in reverse order of profitability). /// /// The returned array space is owned by this class, and overwritten on From 6e60330af55bfdf5b34aed4c9197cd3afbf00498 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 4 Sep 2024 11:08:40 -0700 Subject: [PATCH 138/425] [SampleFDO] Read call-graph matching recovered top-level function profile (#101053) With extbinary profile format, initial profile loading only reads profile based on current function names in the module. However, if a function is renamed, sample loader skips to load its original profile(which has a different name), we will miss this case. To address this, we load the top-level profile candidate explicitly for the matching. If a match is found later, the function profile will be further preserved for use by the sample loader. 
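A minimal sketch of the lookup fallback this change introduces, using plain std:: containers rather than the real SampleProf data structures (the function and type names below are stand-ins, not the actual ProfileData API): try the function's current IR name first, and only then retry through the function-name-to-profile-name map that stale-profile matching fills in.

  // Illustrative sketch only; plain containers stand in for SampleProf classes.
  #include <iostream>
  #include <optional>
  #include <string>
  #include <unordered_map>

  using ProfileMap = std::unordered_map<std::string, int>;          // profile name -> samples
  using RenameMap  = std::unordered_map<std::string, std::string>;  // IR name -> profile name

  std::optional<int> getSamplesFor(const std::string &funcName,
                                   const ProfileMap &profiles,
                                   const RenameMap &funcNameToProfName) {
    // 1. Direct lookup under the function's current (IR) name.
    if (auto it = profiles.find(funcName); it != profiles.end())
      return it->second;
    // 2. Fallback: call-graph matching may have paired this function with a
    //    profile that still carries the old name; retry under that name.
    if (auto r = funcNameToProfName.find(funcName); r != funcNameToProfName.end())
      if (auto it = profiles.find(r->second); it != profiles.end())
        return it->second;
    return std::nullopt;
  }

  int main() {
    ProfileMap profiles{{"foo", 2724522}};    // profile kept the old name
    RenameMap renames{{"foo_rename", "foo"}}; // filled in by the matcher
    if (auto s = getSamplesFor("foo_rename", profiles, renames))
      std::cout << "samples for foo_rename: " << *s << "\n";
    return 0;
  }
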
--- .../llvm/ProfileData/SampleProfReader.h | 21 +++ .../Transforms/IPO/SampleProfileMatcher.h | 1 + .../Transforms/IPO/SampleProfileMatcher.cpp | 90 +++++++--- ...seudo-probe-stale-profile-toplev-func.prof | 23 +++ .../pseudo-probe-stale-profile-toplev-func.ll | 169 ++++++++++++++++++ 5 files changed, 278 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 0fd86600de21f..5301f23def3f3 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -424,6 +424,16 @@ class SampleProfileReader { if (It != Profiles.end()) return &It->second; + if (FuncNameToProfNameMap && !FuncNameToProfNameMap->empty()) { + auto R = FuncNameToProfNameMap->find(FunctionId(Fname)); + if (R != FuncNameToProfNameMap->end()) { + Fname = R->second.stringRef(); + auto It = Profiles.find(FunctionId(Fname)); + if (It != Profiles.end()) + return &It->second; + } + } + if (Remapper) { if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { auto It = Profiles.find(FunctionId(*NameInProfile)); @@ -505,6 +515,11 @@ class SampleProfileReader { void setModule(const Module *Mod) { M = Mod; } + void setFuncNameToProfNameMap( + const HashKeyMap &FPMap) { + FuncNameToProfNameMap = &FPMap; + } + protected: /// Map every function to its associated profile. /// @@ -541,6 +556,12 @@ class SampleProfileReader { std::unique_ptr Remapper; + // A map pointer to the FuncNameToProfNameMap in SampleProfileLoader, + // which maps the function name to the matched profile name. This is used + // for sample loader to look up profile using the new name. + const HashKeyMap + *FuncNameToProfNameMap = nullptr; + // A map from a function's context hash to its meta data section range, used // for on-demand read function profile metadata. std::unordered_map> diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index a67f158433391..076d91adfd1de 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -198,6 +198,7 @@ class SampleProfileMatcher { // function and all inlinees. 
void countMismatchedCallsiteSamples(const FunctionSamples &FS); void computeAndReportProfileStaleness(); + void UpdateWithSalvagedProfiles(); LocToLocMap &getIRToProfileLocationMap(const Function &F) { auto Ret = FuncMappings.try_emplace( diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 312672e56b017..0c676e8fb95fd 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -36,6 +36,13 @@ static cl::opt MinCallCountForCGMatching( cl::desc("The minimum number of call anchors required for a function to " "run stale profile call graph matching.")); +static cl::opt LoadFuncProfileforCGMatching( + "load-func-profile-for-cg-matching", cl::Hidden, cl::init(false), + cl::desc( + "Load top-level profiles that the sample reader initially skipped for " + "the call-graph matching (only meaningful for extended binary " + "format)")); + extern cl::opt SalvageStaleProfile; extern cl::opt SalvageUnusedProfile; extern cl::opt PersistProfileStaleness; @@ -410,18 +417,19 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // callsites in one context may differ from those in another context. To get // the maximum number of callsites, we merge the function profiles from all // contexts, aka, the flattened profile to find profile anchors. - const auto *FSFlattened = getFlattenedSamplesFor(F); - if (SalvageUnusedProfile && !FSFlattened) { + const auto *FSForMatching = getFlattenedSamplesFor(F); + if (SalvageUnusedProfile && !FSForMatching) { // Apply the matching in place to find the new function's matched profile. - // TODO: For extended profile format, if a function profile is unused and - // it's top-level, even if the profile is matched, it's not found in the - // profile. This is because sample reader only read the used profile at the - // beginning, we need to support loading the profile on-demand in future. auto R = FuncToProfileNameMap.find(&F); - if (R != FuncToProfileNameMap.end()) - FSFlattened = getFlattenedSamplesFor(R->second); + if (R != FuncToProfileNameMap.end()) { + FSForMatching = getFlattenedSamplesFor(R->second); + // Try to find the salvaged top-level profiles that are explicitly loaded + // for the matching, see "functionMatchesProfileHelper" for the details. + if (!FSForMatching && LoadFuncProfileforCGMatching) + FSForMatching = Reader.getSamplesFor(R->second.stringRef()); + } } - if (!FSFlattened) + if (!FSForMatching) return; // Anchors for IR. It's a map from IR location to callee name, callee name is @@ -432,7 +440,7 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // Anchors for profile. It's a map from callsite location to a set of callee // name. AnchorMap ProfileAnchors; - findProfileAnchors(*FSFlattened, ProfileAnchors); + findProfileAnchors(*FSForMatching, ProfileAnchors); // Compute the callsite match states for profile staleness report. if (ReportProfileStaleness || PersistProfileStaleness) @@ -443,7 +451,7 @@ void SampleProfileMatcher::runOnFunction(Function &F) { // For probe-based profiles, run matching only when profile checksum is // mismatched. 
bool ChecksumMismatch = FunctionSamples::ProfileIsProbeBased && - !ProbeManager->profileIsValid(F, *FSFlattened); + !ProbeManager->profileIsValid(F, *FSForMatching); bool RunCFGMatching = !FunctionSamples::ProfileIsProbeBased || ChecksumMismatch; bool RunCGMatching = SalvageUnusedProfile; @@ -781,14 +789,30 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( // two sequences are. float Similarity = 0.0; - const auto *FSFlattened = getFlattenedSamplesFor(ProfFunc); - if (!FSFlattened) + const auto *FSForMatching = getFlattenedSamplesFor(ProfFunc); + // With extbinary profile format, initial profile loading only reads profile + // based on current function names in the module. + // However, if a function is renamed, sample loader skips to load its original + // profile(which has a different name), we will miss this case. To address + // this, we load the top-level profile candidate explicitly for the matching. + if (!FSForMatching && LoadFuncProfileforCGMatching) { + DenseSet TopLevelFunc({ProfFunc.stringRef()}); + if (std::error_code EC = Reader.read(TopLevelFunc)) + return false; + FSForMatching = Reader.getSamplesFor(ProfFunc.stringRef()); + LLVM_DEBUG({ + if (FSForMatching) + dbgs() << "Read top-level function " << ProfFunc + << " for call-graph matching\n"; + }); + } + if (!FSForMatching) return false; // The check for similarity or checksum may not be reliable if the function is // tiny, we use the number of basic block as a proxy for the function // complexity and skip the matching if it's too small. if (IRFunc.size() < MinFuncCountForCGMatching || - FSFlattened->getBodySamples().size() < MinFuncCountForCGMatching) + FSForMatching->getBodySamples().size() < MinFuncCountForCGMatching) return false; // For probe-based function, we first trust the checksum info. If the checksum @@ -796,7 +820,7 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( if (FunctionSamples::ProfileIsProbeBased) { const auto *FuncDesc = ProbeManager->getDesc(IRFunc); if (FuncDesc && - !ProbeManager->profileIsHashMismatched(*FuncDesc, *FSFlattened)) { + !ProbeManager->profileIsHashMismatched(*FuncDesc, *FSForMatching)) { LLVM_DEBUG(dbgs() << "The checksums for " << IRFunc.getName() << "(IR) and " << ProfFunc << "(Profile) match.\n"); @@ -807,7 +831,7 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( AnchorMap IRAnchors; findIRAnchors(IRFunc, IRAnchors); AnchorMap ProfileAnchors; - findProfileAnchors(*FSFlattened, ProfileAnchors); + findProfileAnchors(*FSForMatching, ProfileAnchors); AnchorList FilteredIRAnchorsList; AnchorList FilteredProfileAnchorList; @@ -863,6 +887,29 @@ bool SampleProfileMatcher::functionMatchesProfile(Function &IRFunc, return Matched; } +void SampleProfileMatcher::UpdateWithSalvagedProfiles() { + DenseSet ProfileSalvagedFuncs; + // Update FuncNameToProfNameMap and SymbolMap. + for (auto &I : FuncToProfileNameMap) { + assert(I.first && "New function is null"); + FunctionId FuncName(I.first->getName()); + ProfileSalvagedFuncs.insert(I.second.stringRef()); + FuncNameToProfNameMap->emplace(FuncName, I.second); + + // We need to remove the old entry to avoid duplicating the function + // processing. 
+ SymbolMap->erase(FuncName); + SymbolMap->emplace(I.second, I.first); + } + + // With extbinary profile format, initial profile loading only reads profile + // based on current function names in the module, so we need to load top-level + // profiles for functions with different profile name explicitly after + // function-profile name map is established with stale profile matching. + Reader.read(ProfileSalvagedFuncs); + Reader.setFuncNameToProfNameMap(*FuncNameToProfNameMap); +} + void SampleProfileMatcher::runOnModule() { ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles, FunctionSamples::ProfileIsCS); @@ -880,17 +927,8 @@ void SampleProfileMatcher::runOnModule() { runOnFunction(*F); } - // Update the data in SampleLoader. if (SalvageUnusedProfile) - for (auto &I : FuncToProfileNameMap) { - assert(I.first && "New function is null"); - FunctionId FuncName(I.first->getName()); - FuncNameToProfNameMap->emplace(FuncName, I.second); - // We need to remove the old entry to avoid duplicating the function - // processing. - SymbolMap->erase(FuncName); - SymbolMap->emplace(I.second, I.first); - } + UpdateWithSalvagedProfiles(); if (SalvageStaleProfile) distributeIRToProfileLocationMap(); diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof new file mode 100644 index 0000000000000..86c8cb3285afe --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-stale-profile-toplev-func.prof @@ -0,0 +1,23 @@ +foo:2724522:51 + 1: 51 + 2: 452674 + 3: 47 + 4: 497875 + 6: 415959 + 10: 452623 + 11: 452687 bar:452687 + 12: 452623 + 13: 47 + !CFGChecksum: 281479271677951 +bar:452687:452687 + 1: 452687 + !CFGChecksum: 4294967295 +main:204:0 + 1: 0 + 2: 51 + 3: 0 + 4: 51 + 5: 51 foo:51 + 6: 51 + 7: 0 + !CFGChecksum: 281582264815352 diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll new file mode 100644 index 0000000000000..c839364f23553 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-toplev-func.ll @@ -0,0 +1,169 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-toplev-func.prof --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 --load-func-profile-for-cg-matching 2>&1 | FileCheck %s -check-prefix=CHECK-TEXT +; RUN: llvm-profdata merge --sample %S/Inputs/pseudo-probe-stale-profile-toplev-func.prof -extbinary -o %t.extbinary +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t.extbinary --salvage-stale-profile --salvage-unused-profile -report-profile-staleness -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline --min-call-count-for-cg-matching=0 --min-func-count-for-cg-matching=0 --load-func-profile-for-cg-matching 2>&1 | FileCheck %s -check-prefix=CHECK-EXTBIN + +; CHECK-TEXT: Run stale profile matching for main +; CHECK-TEXT-NOT: Read top-level function foo for call-graph matching +; CHECK-TEXT: The checksums for foo_rename(IR) and foo(Profile) match. 
+; CHECK-TEXT: Function:foo_rename matches profile:foo +; CHECK-TEXT: Run stale profile matching for foo_rename +; CHECK-TEXT: (1/3) of functions' profile are matched and (2724522/3177413) of samples are reused by call graph matching. + +; CHECK-TEXT: Processing Function main +; CHECK-TEXT: 5: call void @foo_rename(), !dbg ![[#]] - weight: 51 +; CHECK-TEXT: Processing Function foo_rename +; CHECK-TEXT: 2: %call = call i32 @bar(i32 noundef %0), !dbg ![[#]] - weight: 452674 + + +; CHECK-EXTBIN: Run stale profile matching for main +; CHECK-EXTBIN: Read top-level function foo for call-graph matching +; CHECK-EXTBIN: The checksums for foo_rename(IR) and foo(Profile) match. +; CHECK-EXTBIN: Function:foo_rename matches profile:foo +; CHECK-EXTBIN: Run stale profile matching for foo_rename +; CHECK-EXTBIN: (1/3) of functions' profile are matched and (2724522/3177413) of samples are reused by call graph matching. + +; CHECK-EXTBIN: Processing Function main +; CHECK-EXTBIN: 5: call void @foo_rename(), !dbg ![[#]] - weight: 51 +; CHECK-EXTBIN: Processing Function foo_rename +; CHECK-EXTBIN: 2: %call = call i32 @bar(i32 noundef %0), !dbg ![[#]] - weight: 452674 + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = dso_local global i32 0, align 4, !dbg !0 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @bar(i32 noundef %x) #0 !dbg !18 { +entry: + #dbg_value(i32 %x, !22, !DIExpression(), !23) + call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !24 + %add = add nsw i32 %x, 1, !dbg !25 + ret i32 %add, !dbg !26 +} + +; Function Attrs: noinline nounwind uwtable +define dso_local void @foo_rename() #0 !dbg !27 { +entry: + call void @llvm.pseudoprobe(i64 -2115950948644264162, i64 1, i32 0, i64 -1), !dbg !30 + %0 = load volatile i32, ptr @x, align 4, !dbg !30, !tbaa !31 + %call = call i32 @bar(i32 noundef %0), !dbg !35 + %1 = load volatile i32, ptr @x, align 4, !dbg !37, !tbaa !31 + %add = add nsw i32 %1, %call, !dbg !37 + store volatile i32 %add, ptr @x, align 4, !dbg !37, !tbaa !31 + ret void, !dbg !38 +} + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() #1 !dbg !39 { +entry: + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !45 + #dbg_value(i32 0, !43, !DIExpression(), !46) + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ], !dbg !48 + #dbg_value(i32 %i.0, !43, !DIExpression(), !46) + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !49 + %cmp = icmp slt i32 %i.0, 100000, !dbg !51 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !52 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !53 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 7, i32 0, i64 -1), !dbg !54 + ret i32 0, !dbg !54 + +for.body: ; preds = %for.cond + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !55 + call void @foo_rename(), !dbg !57 + call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 6, i32 0, i64 -1), !dbg !59 + %inc = add nsw i32 %i.0, 1, !dbg !59 + #dbg_value(i32 %inc, !43, !DIExpression(), !46) + br label %for.cond, !dbg !60, !llvm.loop !61 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void 
@llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3 + +attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #1 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} +!llvm.ident = !{!14} +!llvm.pseudo_probe_desc = !{!15, !16, !17} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 20.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "test_rename.c", directory: "/home", checksumkind: CSK_MD5, checksum: "11a33a83e4d190ebda0792d0610f0c67") +!4 = !{!0} +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 7, !"Dwarf Version", i32 5} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 4} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"PIE Level", i32 2} +!12 = !{i32 7, !"uwtable", i32 2} +!13 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!14 = !{!"clang version 20.0.0"} +!15 = !{i64 -2012135647395072713, i64 4294967295, !"bar"} +!16 = !{i64 -2115950948644264162, i64 281479271677951, !"foo_rename"} +!17 = !{i64 -2624081020897602054, i64 281582264815352, !"main"} +!18 = distinct !DISubprogram(name: "bar", scope: !3, file: !3, line: 3, type: !19, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21) +!19 = !DISubroutineType(types: !20) +!20 = !{!6, !6} +!21 = !{!22} +!22 = !DILocalVariable(name: "x", arg: 1, scope: !18, file: !3, line: 3, type: !6) +!23 = !DILocation(line: 0, scope: !18) +!24 = !DILocation(line: 4, column: 10, scope: !18) +!25 = !DILocation(line: 4, column: 12, scope: !18) +!26 = !DILocation(line: 4, column: 3, scope: !18) +!27 = distinct !DISubprogram(name: "foo_rename", scope: !3, file: !3, line: 7, type: !28, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!28 = !DISubroutineType(types: !29) +!29 = !{null} +!30 = !DILocation(line: 8, column: 15, scope: !27) +!31 = !{!32, !32, i64 0} +!32 = !{!"int", !33, i64 0} +!33 = !{!"omnipotent char", !34, i64 0} +!34 = !{!"Simple C/C++ TBAA"} +!35 = !DILocation(line: 8, column: 11, scope: !36) +!36 = 
!DILexicalBlockFile(scope: !27, file: !3, discriminator: 455082007) +!37 = !DILocation(line: 8, column: 8, scope: !27) +!38 = !DILocation(line: 9, column: 1, scope: !27) +!39 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !40, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !42) +!40 = !DISubroutineType(types: !41) +!41 = !{!6} +!42 = !{!43} +!43 = !DILocalVariable(name: "i", scope: !44, file: !3, line: 12, type: !6) +!44 = distinct !DILexicalBlock(scope: !39, file: !3, line: 12, column: 3) +!45 = !DILocation(line: 12, column: 12, scope: !44) +!46 = !DILocation(line: 0, scope: !44) +!47 = !DILocation(line: 12, column: 8, scope: !44) +!48 = !DILocation(line: 12, scope: !44) +!49 = !DILocation(line: 12, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !44, file: !3, line: 12, column: 3) +!51 = !DILocation(line: 12, column: 21, scope: !50) +!52 = !DILocation(line: 12, column: 3, scope: !44) +!53 = !DILocation(line: 0, scope: !39) +!54 = !DILocation(line: 15, column: 1, scope: !39) +!55 = !DILocation(line: 13, column: 7, scope: !56) +!56 = distinct !DILexicalBlock(scope: !50, file: !3, line: 12, column: 40) +!57 = !DILocation(line: 13, column: 7, scope: !58) +!58 = !DILexicalBlockFile(scope: !56, file: !3, discriminator: 455082031) +!59 = !DILocation(line: 12, column: 36, scope: !50) +!60 = !DILocation(line: 12, column: 3, scope: !50) +!61 = distinct !{!61, !52, !62, !63} +!62 = !DILocation(line: 14, column: 3, scope: !44) +!63 = !{!"llvm.loop.mustprogress"} From 2e0ded3371f8d42f376bdfd4d70687537e36818e Mon Sep 17 00:00:00 2001 From: R-Goc <131907007+R-Goc@users.noreply.github.com> Date: Wed, 4 Sep 2024 20:10:36 +0200 Subject: [PATCH 139/425] [Windows SEH] Fix crash on empty seh block (#107031) Fixes https://github.com/llvm/llvm-project/issues/105813 and https://github.com/llvm/llvm-project/issues/106915. Adds a check for the end of the iterator, which can be a sentinel. The issue was introduced in https://github.com/llvm/llvm-project/commit/0efe111365ae176671e01252d24028047d807a84 from what I can see, so along with the introduction of /EHa support. 
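The crash pattern generalizes: any "first element satisfying P" helper can return the end sentinel when the block is empty (or holds only PHIs), and dereferencing that sentinel is undefined behavior. Below is a small stand-alone C++ sketch of the guarded pattern added here; std::find_if and the Instr type stand in for getFirstNonPHI and MachineInstr and are not the actual SelectionDAGISel code.

  // Illustrative sketch only: check for the end sentinel before dereferencing,
  // mirroring the `MBBb == MBB.end()` guard this patch adds.
  #include <algorithm>
  #include <iostream>
  #include <string>
  #include <vector>

  struct Instr { std::string op; };

  const Instr *firstNonPhi(const std::vector<Instr> &block) {
    auto it = std::find_if(block.begin(), block.end(),
                           [](const Instr &i) { return i.op != "phi"; });
    if (it == block.end()) // empty block, or PHIs only: nothing to return
      return nullptr;      // without this check, *it would be UB
    return &*it;
  }

  int main() {
    std::vector<Instr> empty;                     // like the empty SEH scope block
    std::vector<Instr> normal{{"phi"}, {"call"}};
    std::cout << (firstNonPhi(empty) ? "found" : "none") << "\n"; // prints "none"
    std::cout << firstNonPhi(normal)->op << "\n";                 // prints "call"
    return 0;
  }
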
--- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 4 ++++ .../CodeGen/WinEH/wineh-empty-seh-scope.ll | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index b37e54d66ddf5..263a213bd4f64 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1476,6 +1476,10 @@ void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) { if (BB->getFirstMayFaultInst()) { // Report IP range only for blocks with Faulty inst auto MBBb = MBB.getFirstNonPHI(); + + if (MBBb == MBB.end()) + continue; + MachineInstr *MIb = &*MBBb; if (MIb->isTerminator()) continue; diff --git a/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll b/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll new file mode 100644 index 0000000000000..5f382f10f180b --- /dev/null +++ b/llvm/test/CodeGen/WinEH/wineh-empty-seh-scope.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-pc-windows-msvc19.41.34120 < %s | FileCheck %s + +define void @foo() personality ptr @__CxxFrameHandler3 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: nop # avoids zero-length function + call void @llvm.seh.scope.begin() + unreachable +} + +declare i32 @__CxxFrameHandler3(...) + +declare void @llvm.seh.scope.begin() + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"eh-asynch", i32 1} From 36c210bb340cfdc68d314dd188e18c0bf017b999 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 11:14:31 -0700 Subject: [PATCH 140/425] [RISCV] Remove pre-assignment of mask vectors during call lowering. NFC (#107192) The first mask vector operand is supposed to be assigned to V0. No other vector types will be assigned to V0. We don't need to pre-assign, we can just try V0 first for any mask vectors in the normal processing. --- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 26 ++-------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 ++++--------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 ++-- 3 files changed, 17 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 6e33032384ede..31a9df53a2aa1 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -35,9 +35,6 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // Whether this is assigning args for a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. 
- bool AssignedFirstMaskArg = false; - public: RISCVOutgoingValueAssigner( RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) @@ -52,16 +49,9 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering())) return true; StackSize = State.getStackSize(); @@ -197,9 +187,6 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // Whether this is assigning args from a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. - bool AssignedFirstMaskArg = false; - public: RISCVIncomingValueAssigner( RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) @@ -217,16 +204,9 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { if (LocVT.isScalableVector()) MF.getInfo()->setIsVectorCall(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, /*IsFixed=*/true, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering())) return true; StackSize = State.getStackSize(); @@ -483,7 +463,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, MVT VT = MVT::getVT(Outs[I].Ty); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, - /*isRet=*/true, nullptr, TLI, FirstMaskArgument)) + /*isRet=*/true, nullptr, TLI)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8f95a86ade303..bc661c72e5ecc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19122,17 +19122,16 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } -static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, - std::optional FirstMaskArgument, - CCState &State, +static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, const RISCVTargetLowering &TLI) { const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); if (RC == &RISCV::VRRegClass) { // Assign the first mask argument to V0. // This is an interim calling convention and it may be changed in the // future. 
- if (FirstMaskArgument && ValNo == *FirstMaskArgument) - return State.AllocateReg(RISCV::V0); + if (ValVT.getVectorElementType() == MVT::i1) + if (MCRegister Reg = State.AllocateReg(RISCV::V0)) + return Reg; return State.AllocateReg(ArgVRs); } if (RC == &RISCV::VRM2RegClass) @@ -19170,8 +19169,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; @@ -19351,7 +19349,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s); else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { - Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI); + Reg = allocateRVVReg(ValVT, ValNo, State, TLI); if (!Reg) { // For return values, the vector must be passed fully via registers or // via the stack. @@ -19421,16 +19419,6 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, return false; } -template -static std::optional preAssignMask(const ArgTy &Args) { - for (const auto &ArgIdx : enumerate(Args)) { - MVT ArgVT = ArgIdx.value().VT; - if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1) - return ArgIdx.index(); - } - return std::nullopt; -} - void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -19438,10 +19426,6 @@ void RISCVTargetLowering::analyzeInputArgs( unsigned NumArgs = Ins.size(); FunctionType *FType = MF.getFunction().getFunctionType(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Ins); - for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; @@ -19454,8 +19438,7 @@ void RISCVTargetLowering::analyzeInputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this, - FirstMaskArgument)) { + ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -19469,10 +19452,6 @@ void RISCVTargetLowering::analyzeOutputArgs( CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const { unsigned NumArgs = Outs.size(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); - for (unsigned i = 0; i != NumArgs; i++) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; @@ -19480,8 +19459,7 @@ void RISCVTargetLowering::analyzeOutputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this, - FirstMaskArgument)) { + ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -19659,8 +19637,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI 
ABI, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + const RISCVTargetLowering &TLI) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -19744,8 +19721,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, } if (LocVT.isVector()) { - if (MCRegister Reg = - allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) { + if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. if (ValVT.isFixedLengthVector()) @@ -20377,17 +20353,13 @@ bool RISCVTargetLowering::CanLowerReturn( SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr, *this, FirstMaskArgument)) + nullptr, *this)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f1d1cca043b35..3beee4686956e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -904,8 +904,7 @@ class RISCVTargetLowering : public TargetLowering { CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + const RISCVTargetLowering &TLI); private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, @@ -1054,14 +1053,12 @@ namespace RISCV { bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, From b30880e975279c1c8ef4c2645eb03063e4b19f2b Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Wed, 4 Sep 2024 11:25:30 -0700 Subject: [PATCH 141/425] [SandboxIR] Fix linking error caused by 840da2e8ba7e0f77938adfc6f6d315137542a1b8 --- llvm/lib/SandboxIR/Tracker.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index c6eb9fc68a4b1..b1f472d7928f4 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -251,10 +251,12 @@ void ShuffleVectorSetMask::dump() const { CmpSwapOperands::CmpSwapOperands(CmpInst *Cmp) : Cmp(Cmp) {} void CmpSwapOperands::revert(Tracker &Tracker) 
{ Cmp->swapOperands(); } +#ifndef NDEBUG void CmpSwapOperands::dump() const { dump(dbgs()); dbgs() << "\n"; } +#endif void Tracker::save() { State = TrackerState::Record; } From a43137c3f85fd87f90c9a8ffaebd71d432018e79 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Wed, 4 Sep 2024 11:46:01 -0700 Subject: [PATCH 142/425] [LLVM][DWARF] Make some effort to avoid duplicates in .debug_ranges. (#106614) Inlining and zero-cost abstractions tend to produce volumes of debug info with identical ranges. When built with full debugging information (the equivalent of -g2) librustc_driver.so has 2.1 million entries in .debug_ranges. But only 1.1 million of those entries are unique. While in principle all duplicates could be eliminated with a hashtable, checking to see if the new range is exactly identical to the previous range and skipping a new addition if it is is sufficient to eliminate 99.99% of the duplicates. This reduces the size of librustc_driver.so's .debug_ranges section by 35%, or the overall binary size a little more than 1%. --- llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp | 16 ++++- llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 4 ++ .../Generic/debug-ranges-duplication.ll | 70 +++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp index eab798c0da784..cd1279d202132 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -121,7 +121,19 @@ void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) { std::pair DwarfFile::addRange(const DwarfCompileUnit &CU, SmallVector R) { - CURangeLists.push_back( - RangeSpanList{Asm->createTempSymbol("debug_ranges"), &CU, std::move(R)}); + bool CanReuseLastRange = false; + + if (!CURangeLists.empty()) { + auto Last = CURangeLists.back(); + if (Last.CU == &CU && Last.Ranges == R) { + CanReuseLastRange = true; + } + } + + if (!CanReuseLastRange) { + CURangeLists.push_back(RangeSpanList{Asm->createTempSymbol("debug_ranges"), + &CU, std::move(R)}); + } + return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back()); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index f76858fc2f36a..0fc2b91ddfa91 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -37,6 +37,10 @@ class MDNode; struct RangeSpan { const MCSymbol *Begin; const MCSymbol *End; + + bool operator==(const RangeSpan &Other) const { + return Begin == Other.Begin && End == Other.End; + } }; struct RangeSpanList { diff --git a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll new file mode 100644 index 0000000000000..540400be740a5 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll @@ -0,0 +1,70 @@ +; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s +; +; Generated from the following C++ source with: +; clang -S -emit-llvm -g -O2 test.c +; +; /* BEGIN SOURCE */ +; void f1(); +; inline void f2() { +; f1(); +; f1(); +; } +; inline void f3() { +; f2(); +; } +; void f4() { +; f3(); +; f1(); +; } +; /* END SOURCE */ +; +; Minor complication: after generating the LLVM IR, it was manually edited so +; that the 'f1()' call from f3 was reordered to appear between the two inlined +; f1 calls from f2. 
This causes f2's inlined_subroutine to use DW_AT_ranges. + +; Check that identical debug ranges in succession reuse the same entry in +; .debug_ranges rather than emitting duplicate entries. + +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_ranges +; CHECK-SAME: rangelist = 0x[[#%.8X,RANGE:]] +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_ranges +; CHECK-SAME: rangelist = 0x[[#RANGE]] + +; Function Attrs: nounwind uwtable +define dso_local void @f4() local_unnamed_addr !dbg !9 { +entry: + tail call void (...) @f1(), !dbg !12 + tail call void (...) @f1(), !dbg !18 + tail call void (...) @f1(), !dbg !17 + ret void, !dbg !19 +} + +declare !dbg !20 void @f1(...) local_unnamed_addr + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 9edd998e10fabfff067b9e6e5b044f85a24d0dd5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/khuey/dev/llvm-project", checksumkind: CSK_MD5, checksum: "4510feb241cf078af753e3dc13205127") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang version 20.0.0git (https://github.com/llvm/llvm-project.git 9edd998e10fabfff067b9e6e5b044f85a24d0dd5)"} +!9 = distinct !DISubprogram(name: "f4", scope: !1, file: !1, line: 9, type: !10, scopeLine: 9, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DISubroutineType(types: !11) +!11 = !{null} +!12 = !DILocation(line: 3, column: 3, scope: !13, inlinedAt: !14) +!13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 2, type: !10, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!14 = distinct !DILocation(line: 7, column: 3, scope: !15, inlinedAt: !16) +!15 = distinct !DISubprogram(name: "f3", scope: !1, file: !1, line: 6, type: !10, scopeLine: 6, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!16 = distinct !DILocation(line: 10, column: 3, scope: !9) +!17 = !DILocation(line: 4, column: 3, scope: !13, inlinedAt: !14) +!18 = !DILocation(line: 11, column: 3, scope: !9) +!19 = !DILocation(line: 12, column: 1, scope: !9) +!20 = !DISubprogram(name: "f1", scope: !1, file: !1, line: 1, type: !10, spFlags: DISPFlagOptimized) From c1667f909949d15c593e4a03a4e992cffa72ad3c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Sep 2024 15:06:27 -0400 Subject: [PATCH 143/425] Fix `transpose->unpack` folding pattern for the partial-tile case of `unpack` (#107271) Just directly create the empty tensor of appropriate shape instead of relying on `UnPackOp::createDestinationTensor` which is trying to infer the destination shape, which isn't possible in general with the set of paramters that it is taking. 
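
As a standalone illustration of why the inference cannot work (plain C++, not MLIR code; the numbers mirror the partial-tile test added below): packing rounds each dimension up to a multiple of its tile size, so distinct source shapes map to the same packed shape, and the unpacked shape has to be taken from the op's own result (via reifyResultShapes) rather than re-derived from the packed operand.

  #include <array>
  #include <cstdio>

  // Packed shape for a rows x cols source with inner tiles tr x tc.
  static std::array<int, 4> packedShape(int rows, int cols, int tr, int tc) {
    return {(rows + tr - 1) / tr, (cols + tc - 1) / tc, tr, tc};
  }

  int main() {
    auto partial = packedShape(15, 3, 16, 4);  // partial tile: 15x3 source
    auto full    = packedShape(16, 4, 16, 4);  // full tile:    16x4 source
    // Both print 1x1x16x4, so the packed shape alone cannot recover 15x3.
    std::printf("%dx%dx%dx%d\n", partial[0], partial[1], partial[2], partial[3]);
    std::printf("%dx%dx%dx%d\n", full[0], full[1], full[2], full[3]);
    return 0;
  }
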
Signed-off-by: Benoit Jacob --- .../Transforms/PackAndUnpackPatterns.cpp | 13 ++++--- .../Tensor/fold-into-pack-and-unpack.mlir | 35 +++++++++++++++---- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp index c681cadcb27cb..995486c87771a 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp @@ -439,6 +439,11 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp if (failed(maybePerm)) return failure(); + SmallVector> unpackOpResultDims; + if (failed(reifyResultShapes(rewriter, unPackOp, unpackOpResultDims))) { + return failure(); + } + SmallVector inverseTransposePerm = invertPermutationVector(maybePerm.value()); auto outerDimsPerm = unPackOp.getOuterDimsPerm(); @@ -448,7 +453,6 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp SmallVector newOuterDimsPermVec; SmallVector newInnerDimsPosVec; SmallVector newMixedInnerTilesVec; - if (!checkAndPermute(inverseTransposePerm, outerDimsPerm, newOuterDimsPermVec, destRank)) return rewriter.notifyMatchFailure( @@ -463,9 +467,10 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp newInnerDimsPosVec.push_back(innerDimsPos[remappedPosition]); } - Value output = unPackOp.createDestinationTensor( - rewriter, unPackOp.getLoc(), linalgOp->getOperand(0), - newMixedInnerTilesVec, newInnerDimsPosVec, newOuterDimsPermVec); + auto elemType = + cast(unPackOp->getResultTypes()[0]).getElementType(); + Value output = rewriter.create( + unPackOp->getLoc(), unpackOpResultDims[0], elemType); rewriter.replaceOpWithNewOp( unPackOp, linalgOp->getOperand(0), output, newInnerDimsPosVec, diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir index 629a4c2135720..bff913f5f55fe 100644 --- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir +++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir @@ -550,6 +550,32 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t // ----- +func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { + %0 = tensor.empty() : tensor<1x1x16x4xi32> + %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) + outs(%0 : tensor<1x1x16x4xi32>) + permutation = [1, 0, 3, 2] + %1 = tensor.empty() : tensor<15x3xi32> + %unpack = tensor.unpack %transposed + outer_dims_perm = [0, 1] + inner_dims_pos = [0, 1] + inner_tiles = [16, 4] into + %1 : tensor<1x1x16x4xi32> -> tensor<15x3xi32> + return %unpack : tensor<15x3xi32> +} +//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_partial_tile( +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<15x3xi32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [4, 16] +// CHECK-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<15x3xi32> +// CHECK: return %[[UNPACK]] : tensor<15x3xi32> +// CHECK: } + +// ----- + func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, %tile_p : index, %tile_q : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) @@ -563,17 +589,14 @@ func.func 
@linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile into %unpack_dest : tensor -> tensor return %unpack : tensor } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)> // CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor, // CHECK-SAME: %[[IDX1:.+]]: index, %[[IDX2:.+]]: index) -> tensor { // CHECK-DAG: %[[CST1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[CST0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[CST0]] : tensor -// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[CST1]] : tensor -// CHECK-DAG: %[[AMAP0:.+]] = affine.apply #[[$MAP]]()[%[[DIM1]], %[[IDX2]]] -// CHECK-DAG: %[[AMAP1:.+]] = affine.apply #[[$MAP]]()[%[[DIM0]], %[[IDX1]]] -// CHECK: %[[OUT:.+]] = tensor.empty(%[[AMAP1]], %[[AMAP0]]) : tensor +// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[CST0]] : tensor +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[CST1]] : tensor +// CHECK: %[[OUT:.+]] = tensor.empty(%[[DIM0]], %[[DIM1]]) : tensor // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1] // CHECK-SAME: inner_dims_pos = [1, 0] From ebf0599314e17c3ab89f303d452811b1db3e6d1e Mon Sep 17 00:00:00 2001 From: SJW <48454132+sjw36@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:24:58 -0500 Subject: [PATCH 144/425] [MLIR][SCF] Add support for loop pipeline peeling for dynamic loops. (#106436) Allow speculative execution and predicate results per stage. --- .../Dialect/SCF/Transforms/LoopPipelining.cpp | 116 ++++++++++++------ mlir/test/Dialect/SCF/loop-pipelining.mlir | 103 +++++++++++++++- mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp | 4 +- 3 files changed, 179 insertions(+), 44 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index d8e1cc0ecef88..a34542f0161ac 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -94,8 +94,8 @@ struct LoopPipelinerInternal { RewriterBase &rewriter); /// Emits the epilogue, this creates `maxStage - 1` part which will contain /// operations from stages [i; maxStage], where i is the part index. 
- void emitEpilogue(RewriterBase &rewriter, - llvm::SmallVector &returnValues); + LogicalResult emitEpilogue(RewriterBase &rewriter, + llvm::SmallVector &returnValues); }; bool LoopPipelinerInternal::initializeLoopInfo( @@ -133,10 +133,6 @@ bool LoopPipelinerInternal::initializeLoopInfo( LDBG("--no epilogue or predicate set -> BAIL"); return false; } - if (dynamicLoop && peelEpilogue) { - LDBG("--dynamic loop doesn't support epilogue yet -> BAIL"); - return false; - } std::vector> schedule; options.getScheduleFn(forOp, schedule); if (schedule.empty()) { @@ -313,10 +309,10 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { }); int predicateIdx = i - stages[op]; if (predicates[predicateIdx]) { + OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]); assert(newOp && "failed to predicate op."); } - rewriter.setInsertionPointAfter(newOp); if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { @@ -561,6 +557,7 @@ LogicalResult LoopPipelinerInternal::createKernel( } if (predicates[useStage]) { + OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[useStage]); if (!newOp) return failure(); @@ -568,7 +565,6 @@ LogicalResult LoopPipelinerInternal::createKernel( for (auto values : llvm::zip(op->getResults(), newOp->getResults())) mapping.map(std::get<0>(values), std::get<1>(values)); } - rewriter.setInsertionPointAfter(newOp); if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); } @@ -640,70 +636,113 @@ LogicalResult LoopPipelinerInternal::createKernel( return success(); } -void LoopPipelinerInternal::emitEpilogue( - RewriterBase &rewriter, llvm::SmallVector &returnValues) { +LogicalResult +LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter, + llvm::SmallVector &returnValues) { + Location loc = forOp.getLoc(); // Emit different versions of the induction variable. They will be // removed by dead code if not used. + + // bounds_range = ub - lb + // total_iterations = (bounds_range + step - 1) / step + Type t = lb.getType(); + Value minus1 = + rewriter.create(loc, rewriter.getIntegerAttr(t, -1)); + Value boundsRange = rewriter.create(loc, ub, lb); + Value rangeIncr = rewriter.create(loc, boundsRange, step); + Value rangeDecr = rewriter.create(loc, rangeIncr, minus1); + Value totalIterations = rewriter.create(loc, rangeDecr, step); + + SmallVector predicates(maxStage + 1); for (int64_t i = 0; i < maxStage; i++) { - Location loc = forOp.getLoc(); - Type t = lb.getType(); - Value minusOne = - rewriter.create(loc, rewriter.getIntegerAttr(t, -1)); - // number of iterations = ((ub - 1) - lb) / step - Value totalNumIteration = rewriter.create( - loc, - rewriter.create( - loc, rewriter.create(loc, ub, minusOne), lb), - step); - // newLastIter = lb + step * ((((ub - 1) - lb) / step) - i) + // iterI = total_iters - 1 - i + // May go negative... 
Value minusI = rewriter.create(loc, rewriter.getIntegerAttr(t, -i)); + Value iterI = rewriter.create( + loc, rewriter.create(loc, totalIterations, minus1), + minusI); + // newLastIter = lb + step * iterI Value newlastIter = rewriter.create( - loc, lb, - rewriter.create( - loc, step, - rewriter.create(loc, totalNumIteration, minusI))); + loc, lb, rewriter.create(loc, step, iterI)); + setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); + + if (dynamicLoop) { + // pred = iterI >= lb + predicates[i + 1] = rewriter.create( + loc, arith::CmpIPredicate::sge, iterI, lb); + } } + // Emit `maxStage - 1` epilogue part that includes operations from stages // [i; maxStage]. for (int64_t i = 1; i <= maxStage; i++) { + SmallVector> returnMap(returnValues.size()); for (Operation *op : opOrder) { if (stages[op] < i) continue; + unsigned currentVersion = maxStage - stages[op] + i; + unsigned nextVersion = currentVersion + 1; Operation *newOp = cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) { auto it = valueMapping.find(newOperand->get()); if (it != valueMapping.end()) { - Value replacement = it->second[maxStage - stages[op] + i]; + Value replacement = it->second[currentVersion]; newOperand->set(replacement); } }); + if (dynamicLoop) { + OpBuilder::InsertionGuard insertGuard(rewriter); + newOp = predicateFn(rewriter, newOp, predicates[currentVersion]); + if (!newOp) + return failure(); + } if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); - for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { - setValueMapping(op->getResult(destId), newOp->getResult(destId), - maxStage - stages[op] + i); + + for (auto [opRes, newRes] : + llvm::zip(op->getResults(), newOp->getResults())) { + setValueMapping(opRes, newRes, currentVersion); // If the value is a loop carried dependency update the loop argument // mapping and keep track of the last version to replace the original // forOp uses. for (OpOperand &operand : forOp.getBody()->getTerminator()->getOpOperands()) { - if (operand.get() != op->getResult(destId)) + if (operand.get() != opRes) continue; - unsigned version = maxStage - stages[op] + i + 1; // If the version is greater than maxStage it means it maps to the // original forOp returned value. - if (version > maxStage) { - returnValues[operand.getOperandNumber()] = newOp->getResult(destId); - continue; - } - setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], - newOp->getResult(destId), version); + unsigned ri = operand.getOperandNumber(); + returnValues[ri] = newRes; + Value mapVal = forOp.getRegionIterArgs()[ri]; + returnMap[ri] = std::make_pair(mapVal, currentVersion); + if (nextVersion <= maxStage) + setValueMapping(mapVal, newRes, nextVersion); + } + } + } + if (dynamicLoop) { + // Select return values from this stage (live outs) based on predication. + // If the stage is valid select the peeled value, else use previous stage + // value. 
+ for (auto pair : llvm::enumerate(returnValues)) { + unsigned ri = pair.index(); + auto [mapVal, currentVersion] = returnMap[ri]; + if (mapVal) { + unsigned nextVersion = currentVersion + 1; + Value pred = predicates[currentVersion]; + Value prevValue = valueMapping[mapVal][currentVersion]; + auto selOp = rewriter.create(loc, pred, pair.value(), + prevValue); + returnValues[ri] = selOp; + if (nextVersion <= maxStage) + setValueMapping(mapVal, selOp, nextVersion); } } } } + return success(); } void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { @@ -760,7 +799,8 @@ FailureOr mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp, if (options.peelEpilogue) { // 4. Emit the epilogue after the new forOp. rewriter.setInsertionPointAfter(newForOp); - pipeliner.emitEpilogue(rewriter, returnValues); + if (failed(pipeliner.emitEpilogue(rewriter, returnValues))) + return failure(); } // 5. Erase the original loop and replace the uses with the epilogue output. if (forOp->getNumResults() > 0) diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir index 9687f80f5ddfc..4a1406faabce1 100644 --- a/mlir/test/Dialect/SCF/loop-pipelining.mlir +++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir @@ -764,11 +764,44 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // NOEPILOGUE: memref.load %[[A]][%[[IV3]]] : memref // NOEPILOGUE: scf.yield %[[V2]], %[[L3]] : f32, f32 -// In case dynamic loop pipelining is off check that the transformation didn't -// apply. +// Check for predicated epilogue for dynamic loop. // CHECK-LABEL: dynamic_loop( -// CHECK-NOT: memref.load -// CHECK: scf.for +// CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// CHECK: memref.store %[[ARG6]], %{{.*}}[%[[ARG5]]] +// CHECK: %[[ADDF_24:.*]] = arith.addf %[[ARG7]], %{{.*}} +// CHECK: %[[MULI_25:.*]] = arith.muli %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_26:.*]] = arith.addi %[[ARG5]], %[[MULI_25]] +// CHECK: %[[LOAD_27:.*]] = memref.load %{{.*}}[%[[ADDI_26]]] +// CHECK: scf.yield %[[ADDF_24]], %[[LOAD_27]] +// CHECK: } +// CHECK: %[[SUBI_10:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SUBI_10]], %{{.*}} +// CHECK: %[[ADDI_12:.*]] = arith.addi %[[ADDI_11]], %{{.*}}-1 +// CHECK: %[[DIVUI_13:.*]] = arith.divui %[[ADDI_12]], %{{.*}} +// CHECK: %[[ADDI_14:.*]] = arith.addi %[[DIVUI_13]], %{{.*}}-1 +// CHECK: %[[MULI_15:.*]] = arith.muli %{{.*}}, %[[ADDI_14]] +// CHECK: %[[ADDI_16:.*]] = arith.addi %{{.*}}, %[[MULI_15]] +// CHECK: %[[CMPI_17:.*]] = arith.cmpi sge, %[[ADDI_14]], %{{.*}} +// CHECK: %[[ADDI_18:.*]] = arith.addi %[[DIVUI_13]], %{{.*}}-1 +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ADDI_18]], %{{.*}}-1 +// CHECK: %[[MULI_20:.*]] = arith.muli %{{.*}}, %[[ADDI_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %[[MULI_20]] +// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_19]], %{{.*}} +// CHECK: scf.if %[[CMPI_17]] { +// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_21]]] +// CHECK: } else { +// CHECK: } +// CHECK: %[[IF_23:.*]] = scf.if %[[CMPI_22]] -> (f32) { +// CHECK: %[[ADDF_24:.*]] = arith.addf %{{.*}}#1, %{{.*}} +// CHECK: scf.yield %[[ADDF_24]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: scf.if %[[CMPI_22]] { +// CHECK: memref.store %[[IF_23]], %{{.*}}[%[[ADDI_16]]] +// CHECK: } else { +// CHECK: } +// CHECK: return func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, %ub: 
index, %step: index) { %cf = arith.constant 1.0 : f32 scf.for %i0 = %lb to %ub step %step { @@ -781,6 +814,68 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // ----- +// NOEPILOGUE-LABEL: func.func @dynamic_loop_result +// NOEPILOGUE: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// NOEPILOGUE: %[[SUBI_3:.*]] = arith.subi %{{.*}}, %{{.*}} +// NOEPILOGUE: %[[CMPI_4:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_3]] +// NOEPILOGUE: %[[ADDF_5:.*]] = arith.addf %[[ARG7]], %[[ARG6]] +// NOEPILOGUE: %[[MULF_6:.*]] = arith.mulf %[[ADDF_5]], %{{.*}} +// NOEPILOGUE: %[[ADDI_7:.*]] = arith.addi %[[ARG5]], %{{.*}} +// NOEPILOGUE: %[[IF_8:.*]] = scf.if %[[CMPI_4]] +// NOEPILOGUE: %[[LOAD_9:.*]] = memref.load %{{.*}}[%[[ADDI_7]]] +// NOEPILOGUE: scf.yield %[[LOAD_9]] +// NOEPILOGUE: } else { +// NOEPILOGUE: scf.yield %{{.*}} +// NOEPILOGUE: } +// NOEPILOGUE: scf.yield %[[MULF_6]], %[[IF_8]] +// NOEPILOGUE: } +// NOEPILOGUE: memref.store %{{.*}}#0, %{{.*}}[%{{.*}}] + +// Check for predicated epilogue for dynamic loop. +// CHECK-LABEL: func.func @dynamic_loop_result +// CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) +// CHECK: %[[ADDF_13:.*]] = arith.addf %[[ARG7]], %[[ARG6]] +// CHECK: %[[MULF_14:.*]] = arith.mulf %[[ADDF_13]], %{{.*}} +// CHECK: %[[ADDI_15:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[LOAD_16:.*]] = memref.load %{{.*}}[%[[ADDI_15]]] +// CHECK: scf.yield %[[MULF_14]], %[[LOAD_16]] +// CHECK: } +// CHECK: %[[SUBI_4:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_5:.*]] = arith.addi %[[SUBI_4]], %{{.*}} +// CHECK: %[[ADDI_6:.*]] = arith.addi %[[ADDI_5]], %{{.*}}-1 +// CHECK: %[[DIVUI_7:.*]] = arith.divui %[[ADDI_6]], %{{.*}} +// CHECK: %[[ADDI_8:.*]] = arith.addi %[[DIVUI_7]], %{{.*}}-1 +// CHECK: %[[CMPI_9:.*]] = arith.cmpi sge, %[[ADDI_8]], %{{.*}} +// CHECK: %[[IF_10:.*]] = scf.if %[[CMPI_9]] +// CHECK: %[[ADDF_13:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 +// CHECK: scf.yield %[[ADDF_13]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_9]] +// CHECK: %[[MULF_13:.*]] = arith.mulf %[[IF_10]], %{{.*}} +// CHECK: scf.yield %[[MULF_13]] +// CHECK: } else { +// CHECK: scf.yield %{{.*}} +// CHECK: } +// CHECK: %[[SELECT_12:.*]] = arith.select %[[CMPI_9]], %[[IF_11]], %{{.*}}#0 +// CHECK: memref.store %[[SELECT_12]], %{{.*}}[%{{.*}}] +func.func @dynamic_loop_result(%A: memref, %result: memref, %lb: index, %ub: index, %step: index) { + %cf0 = arith.constant 1.0 : f32 + %cf1 = arith.constant 33.0 : f32 + %cst = arith.constant 0 : index + %res:1 = scf.for %i0 = %lb to %ub step %step iter_args (%arg0 = %cf0) -> (f32) { + %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref + %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32 + %A2_elem = arith.mulf %A1_elem, %cf1 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32 + scf.yield %A2_elem : f32 + } { __test_pipelining_loop__ } + memref.store %res#0, %result[%cst] : memref + return +} + +// ----- + // CHECK-LABEL: yield_constant_loop( // CHECK-SAME: %[[A:.*]]: memref) -> f32 { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index diff --git a/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp b/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp index 8a92d840ad130..3ff7f9966e93d 100644 --- 
a/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp +++ b/mlir/test/lib/Dialect/SCF/TestSCFUtils.cpp @@ -214,12 +214,12 @@ struct TestSCFPipeliningPass RewritePatternSet patterns(&getContext()); mlir::scf::PipeliningOption options; options.getScheduleFn = getSchedule; + options.supportDynamicLoops = true; + options.predicateFn = predicateOp; if (annotatePipeline) options.annotateFn = annotate; if (noEpiloguePeeling) { - options.supportDynamicLoops = true; options.peelEpilogue = false; - options.predicateFn = predicateOp; } scf::populateSCFLoopPipeliningPatterns(patterns, options); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); From 0fffdeb5f46078ddcc61e112cd38856b1165f050 Mon Sep 17 00:00:00 2001 From: Ziqing Luo Date: Wed, 4 Sep 2024 12:34:43 -0700 Subject: [PATCH 145/425] [-Wunsafe-buffer-usage] Warning Libc functions (#101583) [-Wunsafe-buffer-usage] Add warn on unsafe calls to libc functions Warning about calls to libc functions involving buffer access. Warned functions are hardcoded by names. (rdar://117182250) --- .../Analysis/Analyses/UnsafeBufferUsage.h | 15 + .../Analyses/UnsafeBufferUsageGadgets.def | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 7 + clang/lib/Analysis/UnsafeBufferUsage.cpp | 513 +++++++++++++++++- clang/lib/Sema/AnalysisBasedWarnings.cpp | 14 + ...-usage-libc-functions-inline-namespace.cpp | 60 ++ ...arn-unsafe-buffer-usage-libc-functions.cpp | 106 ++++ ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 8 files changed, 716 insertions(+), 4 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index 228b4ae1e3e11..aa2c01ad10d45 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -106,6 +107,20 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; + /// Invoked when a call to an unsafe libc function is found. + /// \param PrintfInfo + /// is 0 if the callee function is not a member of the printf family; + /// is 1 if the callee is `sprintf`; + /// is 2 if arguments of the call have `__size_by` relation but are not in a + /// safe pattern; + /// is 3 if string arguments do not guarantee null-termination + /// is 4 if the callee takes va_list + /// \param UnsafeArg one of the actual arguments that is unsafe, non-null + /// only when `2 <= PrintfInfo <= 3` + virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) = 0; + /// Invoked when an unsafe operation with a std container is found. 
virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 242ad763ba62b..ac01b285ae833 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -38,6 +38,7 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) +WARNING_GADGET(UnsafeLibcFunctionCall) WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dcb49d8a67604..35f68f51dfb35 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12412,6 +12412,13 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; +def warn_unsafe_buffer_libc_call : Warning< + "function %0 is unsafe">, + InGroup, DefaultIgnore; +def note_unsafe_buffer_printf_call : Note< + "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" + "|string argument is not guaranteed to be null-terminated" + "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index da7446913f7c8..f0d072643f8ff 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" -#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -443,6 +443,426 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } +AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { + return Node.getNumArgs() == Num; +} + +namespace libc_func_matchers { +// Under `libc_func_matchers`, define a set of matchers that match unsafe +// functions in libc and unsafe calls to them. + +// A tiny parser to strip off common prefix and suffix of libc function names +// in real code. 
+// +// Given a function name, `matchName` returns `CoreName` according to the +// following grammar: +// +// LibcName := CoreName | CoreName + "_s" +// MatchingName := "__builtin_" + LibcName | +// "__builtin___" + LibcName + "_chk" | +// "__asan_" + LibcName +// +struct LibcFunNamePrefixSuffixParser { + StringRef matchName(StringRef FunName, bool isBuiltin) { + // Try to match __builtin_: + if (isBuiltin && FunName.starts_with("__builtin_")) + // Then either it is __builtin_LibcName or __builtin___LibcName_chk or + // no match: + return matchLibcNameOrBuiltinChk( + FunName.drop_front(10 /* truncate "__builtin_" */)); + // Try to match __asan_: + if (FunName.starts_with("__asan_")) + return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); + return matchLibcName(FunName); + } + + // Parameter `Name` is the substring after stripping off the prefix + // "__builtin_". + StringRef matchLibcNameOrBuiltinChk(StringRef Name) { + if (Name.starts_with("__") && Name.ends_with("_chk")) + return matchLibcName( + Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); + return matchLibcName(Name); + } + + StringRef matchLibcName(StringRef Name) { + if (Name.ends_with("_s")) + return Name.drop_back(2 /* truncate "_s" */); + return Name; + } +}; + +// A pointer type expression is known to be null-terminated, if it has the +// form: E.c_str(), for any expression E of `std::string` type. +static bool isNullTermPointer(const Expr *Ptr) { + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + const CXXMethodDecl *MD = MCE->getMethodDecl(); + const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); + + if (MD && RD && RD->isInStdNamespace()) + if (MD->getName() == "c_str" && RD->getName() == "basic_string") + return true; + } + return false; +} + +// Return true iff at least one of following cases holds: +// 1. Format string is a literal and there is an unsafe pointer argument +// corresponding to an `s` specifier; +// 2. Format string is not a literal and there is least an unsafe pointer +// argument (including the formatter argument). +// +// `UnsafeArg` is the output argument that will be set only if this function +// returns true. 
+static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, + const unsigned FmtArgIdx, ASTContext &Ctx, + bool isKprintf = false) { + class StringFormatStringHandler + : public analyze_format_string::FormatStringHandler { + const CallExpr *Call; + unsigned FmtArgIdx; + const Expr *&UnsafeArg; + + public: + StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, + const Expr *&UnsafeArg) + : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} + + bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, + const char *startSpecifier, + unsigned specifierLen, + const TargetInfo &Target) override { + if (FS.getConversionSpecifier().getKind() == + analyze_printf::PrintfConversionSpecifier::sArg) { + unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; + + if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) + if (!isNullTermPointer(Call->getArg(ArgIdx))) { + UnsafeArg = Call->getArg(ArgIdx); // output + // returning false stops parsing immediately + return false; + } + } + return true; // continue parsing + } + }; + + const Expr *Fmt = Call->getArg(FmtArgIdx); + + if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { + StringRef FmtStr = SL->getString(); + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); + + return analyze_format_string::ParsePrintfString( + Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), + Ctx.getTargetInfo(), isKprintf); + } + // If format is not a string literal, we cannot analyze the format string. + // In this case, this call is considered unsafe if at least one argument + // (including the format argument) is unsafe pointer. + return llvm::any_of( + llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), + [&UnsafeArg](const Expr *Arg) -> bool { + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { + UnsafeArg = Arg; + return true; + } + return false; + }); +} + +// Matches a FunctionDecl node such that +// 1. It's name, after stripping off predefined prefix and suffix, is +// `CoreName`; and +// 2. `CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which +// is a set of libc function names. +// +// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. +// The notation `CoreName[str/wcs]` means a new name obtained from replace +// string "wcs" with "str" in `CoreName`. +AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { + static std::unique_ptr> PredefinedNames = nullptr; + if (!PredefinedNames) + PredefinedNames = + std::make_unique, std::set>({ + // numeric conversion: + "atof", + "atoi", + "atol", + "atoll", + "strtol", + "strtoll", + "strtoul", + "strtoull", + "strtof", + "strtod", + "strtold", + "strtoimax", + "strtoumax", + // "strfromf", "strfromd", "strfroml", // C23? 
+ // string manipulation: + "strcpy", + "strncpy", + "strlcpy", + "strcat", + "strncat", + "strlcat", + "strxfrm", + "strdup", + "strndup", + // string examination: + "strlen", + "strnlen", + "strcmp", + "strncmp", + "stricmp", + "strcasecmp", + "strcoll", + "strchr", + "strrchr", + "strspn", + "strcspn", + "strpbrk", + "strstr", + "strtok", + // "mem-" functions + "memchr", + "wmemchr", + "memcmp", + "wmemcmp", + "memcpy", + "memccpy", + "mempcpy", + "wmemcpy", + "memmove", + "wmemmove", + "memset", + "wmemset", + // IO: + "fread", + "fwrite", + "fgets", + "fgetws", + "gets", + "fputs", + "fputws", + "puts", + // others + "strerror_s", + "strerror_r", + "bcopy", + "bzero", + "bsearch", + "qsort", + }); + + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + // Match predefined names: + if (PredefinedNames->find(Name) != PredefinedNames->end()) + return true; + + std::string NameWCS = Name.str(); + size_t WcsPos = NameWCS.find("wcs"); + + while (WcsPos != std::string::npos) { + NameWCS[WcsPos++] = 's'; + NameWCS[WcsPos++] = 't'; + NameWCS[WcsPos++] = 'r'; + WcsPos = NameWCS.find("wcs", WcsPos); + } + if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) + return true; + // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They + // all should end with "scanf"): + return Name.ends_with("scanf"); +} + +// Match a call to one of the `v*printf` functions taking `va_list`. We cannot +// check safety for these functions so they should be changed to their +// non-va_list versions. +AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf")) + return false; // neither printf nor scanf + return Name.starts_with("v"); +} + +// Matches a call to one of the `sprintf` functions as they are always unsafe +// and should be changed to `snprintf`. +AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || + // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: + Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + return Prefix == "s"; +} + +// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide +// character versions. Calls to these functions can be safe if their arguments +// are carefully made safe. +AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + + return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then if the format string is a string literal, this matcher matches when at +// least one string argument is unsafe. 
If the format is not a string literal, +// this matcher matches when at least one pointer type argument is unsafe. +AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, + clang::ast_matchers::internal::Matcher, + UnsafeStringArgMatcher) { + // Determine what printf it is: + const Expr *FirstArg = Node.getArg(0); + ASTContext &Ctx = Finder->getASTContext(); + + if (isa(FirstArg->IgnoreParenImpCasts())) { + // It is a printf/kprintf. And, the format is a string literal: + bool isKprintf = false; + const Expr *UnsafeArg; + + if (auto *Callee = Node.getDirectCallee()) + if (auto *II = Node.getDirectCallee()->getIdentifier()) + isKprintf = II->getName() == "kprintf"; + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + QualType PtrTy = FirstArg->getType(); + + assert(PtrTy->isPointerType()); + + QualType PteTy = (cast(PtrTy))->getPointeeType(); + + if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, + there can't be any file pointer then */ + && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { + // It is a fprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + const Expr *SecondArg = Node.getArg(1); + + if (SecondArg->getType()->isIntegerType()) { + // It is a snprintf: + const Expr *UnsafeArg; + + if (unsigned UnsafeArgIdx = + hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + // It is printf but the format string is passed by pointer. The only thing we + // can do is to require all pointers to be null-terminated: + for (auto Arg : Node.arguments()) + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) + if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) + return true; + return false; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then it matches if the first two arguments of the call is a pointer and an +// integer and they are not in a safe pattern. +// +// For the first two arguments: `ptr` and `size`, they are safe if in the +// following patterns: +// ptr := DRE.data(); +// size:= DRE.size()/DRE.size_bytes() +// And DRE is a hardened container or view. 
+AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { + if (Node.getNumArgs() < 3) + return false; // not an snprintf call + + const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); + + if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) + return false; // not an snprintf call + + static StringRef SizedObjs[] = {"span", "array", "vector", + "basic_string_view", "basic_string"}; + Buf = Buf->IgnoreParenImpCasts(); + Size = Size->IgnoreParenImpCasts(); + if (auto *MCEPtr = dyn_cast(Buf)) + if (auto *MCESize = dyn_cast(Size)) { + auto *DREOfPtr = dyn_cast( + MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); + auto *DREOfSize = dyn_cast( + MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); + + if (!DREOfPtr || !DREOfSize) + return true; // not in safe pattern + if (DREOfPtr->getDecl() != DREOfSize->getDecl()) + return true; // not in safe pattern + if (MCEPtr->getMethodDecl()->getName() != "data") + return true; // not in safe pattern + + if (MCESize->getMethodDecl()->getName() == "size_bytes" || + // Note here the pointer must be a pointer-to-char type unless there + // is explicit casting. If there is explicit casting, this branch + // is unreachable. Thus, at this branch "size" and "size_bytes" are + // equivalent as the pointer is a char pointer: + MCESize->getMethodDecl()->getName() == "size") + for (StringRef SizedObj : SizedObjs) + if (MCEPtr->getRecordDecl()->isInStdNamespace() && + MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == + SizedObj) + return false; // It is in fact safe + } + return true; // ptr and size are not in safe pattern +} +} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -1030,6 +1450,97 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; +class UnsafeLibcFunctionCallGadget : public WarningGadget { + const CallExpr *const Call; + const Expr *UnsafeArg = nullptr; + constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; + // Extra tags for additional information: + constexpr static const char *const UnsafeSprintfTag = + "UnsafeLibcFunctionCall_sprintf"; + constexpr static const char *const UnsafeSizedByTag = + "UnsafeLibcFunctionCall_sized_by"; + constexpr static const char *const UnsafeStringTag = + "UnsafeLibcFunctionCall_string"; + constexpr static const char *const UnsafeVaListTag = + "UnsafeLibcFunctionCall_va_list"; + + enum UnsafeKind { + OTHERS = 0, // no specific information, the callee function is unsafe + SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
+ SIZED_BY = + 2, // the first two arguments of `snprintf` function have + // "__sized_by" relation but they do not conform to safe patterns + STRING = 3, // an argument is a pointer-to-char-as-string but does not + // guarantee null-termination + VA_LIST = 4, // one of the `-printf`s function that take va_list, which is + // considered unsafe as it is not compile-time check + } WarnedFunKind = OTHERS; + +public: + UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) + : WarningGadget(Kind::UnsafeLibcFunctionCall), + Call(Result.Nodes.getNodeAs(Tag)) { + if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) + WarnedFunKind = SPRINTF; + else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { + WarnedFunKind = STRING; + UnsafeArg = E; + } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { + WarnedFunKind = SIZED_BY; + UnsafeArg = Call->getArg(0); + } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) + WarnedFunKind = VA_LIST; + } + + static Matcher matcher() { + return stmt(anyOf( + callExpr( + callee(functionDecl(anyOf( + // Match a predefined unsafe libc + // function: + functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), + // Match a call to one of the `v*printf` functions + // taking va-list, which cannot be checked at + // compile-time: + functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) + .bind(UnsafeVaListTag), + // Match a call to a `sprintf` function, which is never + // safe: + functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) + .bind(UnsafeSprintfTag)))), + // (unless the call has a sole string literal argument): + unless( + allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), + + // The following two cases require checking against actual + // arguments of the call: + + // Match a call to an `snprintf` function. And first two + // arguments of the call (that describe a buffer) are not in + // safe patterns: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafeSnprintfBuffer()) + .bind(UnsafeSizedByTag), + // Match a call to a `printf` function, which can be safe if + // all arguments are null-terminated: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafePrintfStringArg( + expr().bind(UnsafeStringTag))))); + } + + const Stmt *getBaseStmt() const { return Call; } + + SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } + + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, + bool IsRelatedToDecl, + ASTContext &Ctx) const override { + Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); + } + + DeclUseList getClaimedVarUseSites() const override { return {}; } +}; + // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index e6ce89dc7ec40..72078ae1534b0 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,6 +2304,20 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } + void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) override { + S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) + << Call->getDirectCallee() // We've checked there is a direct callee + << Call->getSourceRange(); + if (PrintfInfo > 0) { + SourceRange R = + UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); + S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) + << PrintfInfo << R; + } + } + void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp new file mode 100644 index 0000000000000..2bd12db93fd52 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp @@ -0,0 +1,60 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +namespace std { + inline namespace __1 { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); + int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); + } +} + +void f(char * p, char * q, std::span s) { + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + + /* Test printfs */ + std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn +} + +void v(std::string s1) { + std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn + std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn +} + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp new file mode 100644 index 0000000000000..1a29654f660c9 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp @@ -0,0 +1,106 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +typedef struct {} FILE; +void memcpy(); +void __asan_memcpy(); +void strcpy(); +void strcpy_s(); +void wcscpy_s(); +unsigned strlen( const char* str ); +int fprintf( FILE* stream, const char* format, ... ); +int printf( const char* format, ... ); +int sprintf( char* buffer, const char* format, ... ); +int swprintf( char* buffer, const char* format, ... ); +int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); +int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int sscanf_s(const char * buffer, const char * format, ...); +int sscanf(const char * buffer, const char * format, ... 
); + +namespace std { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); +} + +void f(char * p, char * q, std::span s, std::span s2) { + memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} + __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} + __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} + strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} + wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} + + + /* Test printfs */ + fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + printf("%s%d", // expected-warning{{function 'printf' is unsafe}} + p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument + *p); + sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} + s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer + s2.size(), + "%s%d", "hello", *p); + vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} + sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} + sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn + printf("%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + strlen("hello");// no warn +} + +void v(std::string s1, int *p) { + snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn +} + + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 844311c3a51a5..989931e41c0cc 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s -// expected-no-diagnostics - typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} From f574b9c9297538a8d471658564619be3ad6e87dd Mon Sep 17 00:00:00 2001 From: Edd Dawson Date: Wed, 4 Sep 2024 20:36:24 +0100 Subject: [PATCH 146/425] [PS4,PS5][Driver] Check for absent SDK when -nostdlib/-nodefaultlibs (#107112) The PlayStation drivers emit warnings if it looks like SDK libraries are missing. Until this point, the check was skipped when either `-nostdlib` or `-nodefaultlibs` was supplied. I believe the idea is that if you aren't linking default libraries, you won't be in need of the SDK. However, in a situation where these switches are supplied, users may still want to pass `-lSomeSDKLib` to the driver/linker with the expectation that libSomeSDKLib.a will be sourced from the SDK. That is, `-nodefaultlibs` and `-nostdlib` affect the libraries passed to the linker, but not the library search paths. So this change removes `-nostdlib`/`-nodefaultlibs` from consideration when deciding whether or not to probe for the SDK's existence. N.B. complete behaviour for `-nostdlib` and `-nodefaultlibs` is yet to be added to the PlayStation compiler drivers. Coming soon. 
SIE tracker: TOOLCHAIN-16704 --- clang/lib/Driver/ToolChains/PS4CPU.cpp | 4 +--- clang/test/Driver/ps4-sdk-root.c | 8 ++------ clang/test/Driver/ps5-sdk-root.c | 10 +++------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 22103eb50803a..54ec59e6398f8 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -354,9 +354,7 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple, SmallString<512> SDKLibDir(SDKRootDir); llvm::sys::path::append(SDKLibDir, "target/lib"); - if (!Args.hasArg(options::OPT_nostdlib) && - !Args.hasArg(options::OPT_nodefaultlibs) && - !Args.hasArg(options::OPT__sysroot_EQ) && !Args.hasArg(options::OPT_E) && + if (!Args.hasArg(options::OPT__sysroot_EQ) && !Args.hasArg(options::OPT_E) && !Args.hasArg(options::OPT_c) && !Args.hasArg(options::OPT_S) && !Args.hasArg(options::OPT_emit_ast) && !llvm::sys::fs::exists(SDKLibDir)) { diff --git a/clang/test/Driver/ps4-sdk-root.c b/clang/test/Driver/ps4-sdk-root.c index e1a04522030c1..3e02fa9fc3bc2 100644 --- a/clang/test/Driver/ps4-sdk-root.c +++ b/clang/test/Driver/ps4-sdk-root.c @@ -6,9 +6,8 @@ // Check that PS4 clang doesn't report a warning message when locating // system libraries (either by looking at the value of SCE_ORBIS_SDK_DIR -// or relative to the location of the compiler driver), if "-c", "-S", "-E", -// "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on -// the command line. +// or relative to the location of the compiler driver), if "-c", "-S", "-E" +// or "--sysroot" option is specified on the command line. // Otherwise, check that PS4 clang reports a warning. // Setting up SCE_ORBIS_SDK_DIR to existing location, which is not a PS4 SDK. @@ -36,9 +35,6 @@ // RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s // RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s -// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s -// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s - // NO-WARN-NOT: {{warning:|error:}} // WARN-SYS-HEADERS: warning: unable to find PS4 system headers directory // WARN-ISYSROOT: warning: no such sysroot directory: 'foo' diff --git a/clang/test/Driver/ps5-sdk-root.c b/clang/test/Driver/ps5-sdk-root.c index c3672aef9dc0c..2a82d8e72283b 100644 --- a/clang/test/Driver/ps5-sdk-root.c +++ b/clang/test/Driver/ps5-sdk-root.c @@ -8,12 +8,11 @@ // Check that PS5 clang doesn't report a warning message when locating // system libraries (either by looking at the value of SCE_PROSPERO_SDK_DIR -// or relative to the location of the compiler driver), if "-c", "-S", "-E", -// "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on -// the command line. +// or relative to the location of the compiler driver), if "-c", "-S", "-E" +// or "--sysroot" option is specified on the command line. // Otherwise, check that PS5 clang reports a warning. 
-// Setting up SCE_PROSPERO_SDK_DIR to existing location, which is not a PS4 SDK.
+// Setting up SCE_PROSPERO_SDK_DIR to existing location, which is not a PS5 SDK.
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
@@ -38,9 +37,6 @@
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
 // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
 
-// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-sie-ps5 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-
 // NO-WARN-NOT: {{warning:|error:}}
 // WARN-SYS-HEADERS: warning: unable to find PS5 system headers directory
 // WARN-ISYSROOT: warning: no such sysroot directory: 'foo'

From c50fecaaaabcf1598dc25fbde24c8352745b4ac9 Mon Sep 17 00:00:00 2001
From: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:37:14 -0500
Subject: [PATCH 147/425] [mlir] Fix region simplification bug when later
 blocks use prior block argument values (#97960)

This fixes #94520 by ensuring that if any block arguments are used
outside of the original block, the block is not considered a candidate
for merging.

More details: the root cause of the issue described in #94520 was that
`^bb2` and `^bb5` were being merged despite `%4` (an argument to `^bb2`)
being used later in `^bb7`. When the block merge occurred, that
unintentionally changed the value of `%4` for all downstream code. This
change prevents that from happening.

---
 mlir/lib/Transforms/Utils/RegionUtils.cpp     |  9 ++++++
 .../Transforms/canonicalize-block-merge.mlir  | 28 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 3c7523827699c..b7d26e8823094 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -877,6 +877,15 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
     if (hasNonEmptyRegion)
       continue;
 
+    // Don't allow merging if this block's arguments are used outside of the
+    // original block.
+    bool argHasExternalUsers = llvm::any_of(
+        block->getArguments(), [block](mlir::BlockArgument &arg) {
+          return arg.isUsedOutsideOfBlock(block);
+        });
+    if (argHasExternalUsers)
+      continue;
+
     // Try to add this block to an existing cluster.
bool addedToCluster = false; for (auto &cluster : clusters) diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir index 92cfde817cf7f..6dbfcb562e588 100644 --- a/mlir/test/Transforms/canonicalize-block-merge.mlir +++ b/mlir/test/Transforms/canonicalize-block-merge.mlir @@ -290,3 +290,31 @@ func.func @dead_dealloc_fold_multi_use(%cond : i1) { memref.dealloc %a: memref<4xf32> return } + +// CHECK-LABEL: func @nested_loop +func.func @nested_loop(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i1) { +// Irreducible control-flow: enter the middle of the loop in LoopBody_entry here. + "test.foo_br"(%arg0, %arg4)[^LoopBody_entry] : (i32, i32) -> () + +// Loop exit condition: jump to exit or LoobBody blocks +^Loop_header: // 2 preds: ^bb2, ^bb3 + // Consumes the block arg from LoopBody_entry + // Because of this use here, we can't merge the two blocks below. + "test.foo_br2"(%0)[^EXIT, ^LoopBody_entry, ^LoopBody_other] : (i32) -> () + +// LoopBody_entry is jumped in from the entry block (bb0) and Loop_header +// It **dominates** the Loop_header. +^LoopBody_entry(%0: i32): // 2 preds: ^bb0, ^Loop_header + // CHECK: test.bar + %1 = "test.bar"(%0) : (i32) -> i32 + cf.br ^Loop_header + +// Other block inside the loop, not dominating the header +^LoopBody_other(%2: i32): // pred: ^Loop_header + // CHECK: test.bar + %3 = "test.bar"(%2) : (i32) -> i32 + cf.br ^Loop_header + +^EXIT: // pred: ^Loop_header + return +} From 34f2c9a9ce73a61b27d75dab7e1eed256491afcc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 4 Sep 2024 20:44:13 +0100 Subject: [PATCH 148/425] [AArch64] Add tests for FP conversion with 3 element vectors. Add tests showing a number of cases where costs for floating point conversions are overestimated for vectors with 3 elements. 
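
A hedged illustration (not part of this patch, and the names below are made
up): one plausible source pattern that lowers to the <3 x float> conversions
exercised by these tests, where an overestimated conversion cost can steer
the vectorizers away from profitable 3-wide code.

    // Assumed example only: converting a 3-channel pixel produces
    // fptoui on a 3-element float vector once the fields are vectorized.
    struct Pixel { float R, G, B; };
    void quantize(const Pixel &In, unsigned char Out[3]) {
      Out[0] = static_cast<unsigned char>(In.R); // float -> i8 (fptoui)
      Out[1] = static_cast<unsigned char>(In.G);
      Out[2] = static_cast<unsigned char>(In.B);
    }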
--- .../CostModel/AArch64/vec3-fp-conversions.ll | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll diff --git a/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll b/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll new file mode 100644 index 0000000000000..334c6107eb383 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/vec3-fp-conversions.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=arm64-apple-macosx < %s | FileCheck %s + +define <3 x i32> @fptoui_v3f32_to_v3i32(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = fptoui <3 x float> %in to <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i32> %conv +; + %conv = fptoui <3 x float> %in to <3 x i32> + ret <3 x i32> %conv +} + +define <3 x i16> @fptoui_v3f32_to_v3i16(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptoui <3 x float> %in to <3 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i16> %conv +; + %conv = fptoui <3 x float> %in to <3 x i16> + ret <3 x i16> %conv +} + +define <3 x i8> @fptoui_v3f32_to_v3i8(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptoui_v3f32_to_v3i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptoui <3 x float> %in to <3 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i8> %conv +; + %conv = fptoui <3 x float> %in to <3 x i8> + ret <3 x i8> %conv +} + +define <3 x i32> @fptosi_v3f32_to_v3i32(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = fptosi <3 x float> %in to <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i32> %conv +; + %conv = fptosi <3 x float> %in to <3 x i32> + ret <3 x i32> %conv +} + +define <3 x i16> @fptosi_v3f32_to_v3i16(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptosi <3 x float> %in to <3 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i16> %conv +; + %conv = fptosi <3 x float> %in to <3 x i16> + ret <3 x i16> %conv +} + +define <3 x i8> @fptosi_v3f32_to_v3i8(<3 x float> %in, ptr %dst) { +; CHECK-LABEL: 'fptosi_v3f32_to_v3i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %conv = fptosi <3 x float> %in to <3 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x i8> %conv +; + %conv = fptosi <3 x float> %in to <3 x i8> + ret <3 x i8> %conv +} + +define <3 x float> @uitofp_v3i32_to_v3f32(<3 x i32> %in, ptr %dst) { +; CHECK-LABEL: 'uitofp_v3i32_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i32> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i32> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @uitofp_v3i16_to_v3f32(<3 x i16> %in, ptr %dst) { +; CHECK-LABEL: 
'uitofp_v3i16_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i16> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i16> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @uitofp_v3i8_to_v3f32(<3 x i8> %in, ptr %dst) { +; CHECK-LABEL: 'uitofp_v3i8_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = uitofp <3 x i8> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = uitofp <3 x i8> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i32_to_v3f32(<3 x i32> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i32_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i32> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i32> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i16_to_v3f32(<3 x i16> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i16_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i16> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i16> %in to <3 x float> + ret <3 x float> %conv +} + +define <3 x float> @sitofp_v3i8_to_v3f32(<3 x i8> %in, ptr %dst) { +; CHECK-LABEL: 'sitofp_v3i8_to_v3f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %conv = sitofp <3 x i8> %in to <3 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <3 x float> %conv +; + %conv = sitofp <3 x i8> %in to <3 x float> + ret <3 x float> %conv +} From 3fe6a064f15cd854fd497594cc20e8b680cd2133 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 4 Sep 2024 20:50:05 +0100 Subject: [PATCH 149/425] [LV] Check if compare is truncated directly in getInstructionCost. The current check for truncated compares in getInstructionCost misses cases where either the first or both operands are constants. Check directly if the compare is marked for truncation. In that case, the minimum bitwidth is that of the operands. The patch also adds asserts to ensure that. This fixes a divergence between legacy and VPlan-based cost model, where the legacy cost model incorrectly estimated the cost of compares with truncated operands. Fixes https://github.com/llvm/llvm-project/issues/107171. 
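
A minimal sketch of the idea, paraphrasing the change below rather than
quoting it (the names follow the surrounding LoopVectorize code):

    // Query the compare itself instead of its first operand, which may be a
    // Constant rather than an Instruction (e.g. `icmp eq i8 0, 0` from the
    // new test case) and therefore never appears in MinBWs.
    if (canTruncateToMinimalBitwidth(I, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);

Previously the lookup went through dyn_cast<Instruction>(I->getOperand(0)),
so a compare whose operands are constants was costed at its original width
even when the compare itself had been marked for narrowing to i1.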
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 15 +++++-
 .../truncate-to-minimal-bitwidth-cost.ll      | 51 +++++++++++++++++++
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0200525a718d5..0ccf442dac999 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6606,9 +6606,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
+
     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
-    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
-      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+    (void)Op0AsInstruction;
+    assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
+            canTruncateToMinimalBitwidth(I, VF)) &&
+           "truncating Op0 must imply truncating the compare");
+    if (canTruncateToMinimalBitwidth(I, VF)) {
+      assert(!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
+             MinBWs[I] == MinBWs[Op0AsInstruction] &&
+                 "if both the operand and the compare are marked for "
+                 "truncation, they must have the same bitwidth");
+      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
+    }
+
     VectorTy = ToVectorTy(ValTy, VF);
     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                   cast<CmpInst>(I)->getPredicate(), CostKind,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 3e2f290a497db..9fe5a2a6a3ecc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -221,7 +221,56 @@ exit:
   ret void
 }
 
+; Test case for https://github.com/llvm/llvm-project/issues/107171.
+define i8 @icmp_ops_narrowed_to_i1() #1 { +; CHECK-LABEL: define i8 @icmp_ops_narrowed_to_i1( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 0, 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[C]] to i64 +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[EXT]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i8 +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[TRUNC_LCSSA:%.*]] = phi i8 [ [[TRUNC]], %[[LOOP]] ], [ 0, %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[TRUNC_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ] + %c = icmp eq i8 0, 0 + %ext = zext i1 %c to i64 + %shr = lshr i64 %ext, 1 + %trunc = trunc i64 %shr to i8 + %iv.next = add i16 %iv, 1 + %ec = icmp eq i16 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %trunc +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +attributes #1 = { "target-features"="+64bit,+v" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -232,4 +281,6 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. 
From 42b4092db99633ec53d136d5da7abfcfb236c14e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 4 Sep 2024 12:57:08 -0700 Subject: [PATCH 150/425] [RISCV] Precommit vmv.v.v with undef passthru tests --- llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 14 ++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll index 3952e48c5c28f..c1602c912da63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll @@ -194,3 +194,17 @@ define @unfoldable_mismatched_sew( %passthr %b = call @llvm.riscv.vmv.v.v.nxv2i32( %passthru, %a.bitcast, iXLen %avl) ret %b } + + +define @undef_passthru( %passthru, %x, %y, iXLen %avl) { +; CHECK-LABEL: undef_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.v v8, v8 +; CHECK-NEXT: ret + %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( %passthru, %x, %y, iXLen %avl) + %b = call @llvm.riscv.vmv.v.v.nxv1i64( undef, %a, iXLen %avl) + ret %b +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 771b2073370e6..7e2ac0e26f251 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -60,3 +60,17 @@ body: | %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 0 /* tu, mu */ ... +--- +name: undef_passthru +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: undef_passthru + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ From d21e731c42d6b967e29dbe2edc16c1b86885df0d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 12:59:53 -0700 Subject: [PATCH 151/425] [RISCV] Fix typos in comment. NFC --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bc661c72e5ecc..2ea909b085a6d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19631,7 +19631,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, } // FastCC has less than 1% performance improvement for some particular -// benchmark. But theoretically, it may has benenfit for some cases. +// benchmark. But theoretically, it may have benefit for some cases. 
bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, From 23f6c3370b8bc0bf773e69a41bf90454c0a10120 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:39:09 -0400 Subject: [PATCH 152/425] [libc++][modules] Remove dependency on __algorithm/max from hypot.h (#107150) That dependency was added recently when we made improvements to std::hypot, but that resulted in `__math` depending on `__algorithm`, which is a very heavyweight module. This patch uses `__math::fmax` instead. --- libcxx/include/__math/hypot.h | 4 ++-- libcxx/test/libcxx/transitive_includes/cxx03.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx11.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx14.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx17.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx20.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx23.csv | 2 -- libcxx/test/libcxx/transitive_includes/cxx26.csv | 2 -- 8 files changed, 2 insertions(+), 16 deletions(-) diff --git a/libcxx/include/__math/hypot.h b/libcxx/include/__math/hypot.h index 2c2c9c38ab530..b2bf8e11c8ec2 100644 --- a/libcxx/include/__math/hypot.h +++ b/libcxx/include/__math/hypot.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___MATH_HYPOT_H #define _LIBCPP___MATH_HYPOT_H -#include <__algorithm/max.h> #include <__config> #include <__math/abs.h> #include <__math/exponential_functions.h> +#include <__math/min_max.h> #include <__math/roots.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_arithmetic.h> @@ -62,7 +62,7 @@ _LIBCPP_HIDE_FROM_ABI _Real __hypot(_Real __x, _Real __y, _Real __z) { const _Real __overflow_scale = __math::ldexp(_Real(1), -(__exp + 20)); // Scale arguments depending on their size - const _Real __max_abs = std::max(__math::fabs(__x), std::max(__math::fabs(__y), __math::fabs(__z))); + const _Real __max_abs = __math::fmax(__math::fabs(__x), __math::fmax(__math::fabs(__y), __math::fabs(__z))); _Real __scale; if (__max_abs > __overflow_threshold) { // x*x + y*y + z*z might overflow __scale = __overflow_scale; diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index fd47c65ffc307..3bf39ea17c912 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -131,8 +131,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 347d5d8796687..49125486cfcf6 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -131,8 +131,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 57bb2f25b3d62..28dfb320fe06c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -132,8 +132,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 
6826aeb75a83b..5b7b6cecf73f8 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -132,8 +132,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 17169e385d544..84ea6433fb12d 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -137,8 +137,6 @@ chrono type_traits chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath type_traits cmath version diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 267dca3cc6c41..946ba486294d3 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -84,8 +84,6 @@ chrono string_view chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath version codecvt cctype diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 267dca3cc6c41..946ba486294d3 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -84,8 +84,6 @@ chrono string_view chrono vector chrono version cinttypes cstdint -cmath cstddef -cmath initializer_list cmath limits cmath version codecvt cctype From 5e19fd172063c8957a35c7fa3596620f79ebba97 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:39:55 -0400 Subject: [PATCH 153/425] [libc++][modules] Consolidate leaf modules into their own top-level modules (#107147) Some modules are leaf modules in the sense that they are not used by any other part of the headers. These leaf modules are easy to consolidate since there is no risk to create a cycle. As a result of regrouping these modules, several missing includes were found and fixed in this patch. 
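
A hedged sketch of the kind of missing include this regrouping turns up (the
header and include names come from the diff below; the reduced usage shown is
assumed, not copied from the header):

    // __mdspan/layout_stride.h previously picked up common_type through
    // another header in the same monolithic module; once the modules are
    // regrouped it has to state the dependency itself.
    #include <__type_traits/common_type.h>
    using __index_type = common_type_t<size_t, ptrdiff_t>; // assumed use

In the modulemap diff below, each detail header then becomes a submodule of
the public module it belongs to, as in the std_charconv and std_mdspan
entries.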
--- libcxx/include/__filesystem/directory_entry.h | 1 + libcxx/include/__filesystem/path.h | 1 + libcxx/include/__mdspan/layout_left.h | 1 + libcxx/include/__mdspan/layout_right.h | 1 + libcxx/include/__mdspan/layout_stride.h | 4 + libcxx/include/module.modulemap | 117 +++++++----------- .../views/mdspan/CustomTestLayouts.h | 1 + .../views/mdspan/extents/comparison.pass.cpp | 5 +- .../views/mdspan/extents/conversion.pass.cpp | 5 +- .../views/mdspan/extents/ctad.pass.cpp | 1 + .../views/mdspan/extents/dextents.pass.cpp | 1 + .../views/mdspan/extents/dims.pass.cpp | 1 + .../mdspan/extents/index_type.verify.cpp | 1 + .../views/mdspan/extents/obs_static.pass.cpp | 1 + .../views/mdspan/extents/types.pass.cpp | 5 +- .../mdspan/layout_left/comparison.pass.cpp | 5 +- .../mdspan/layout_left/ctor.default.pass.cpp | 1 + .../mdspan/layout_left/ctor.extents.pass.cpp | 1 + .../layout_left/ctor.layout_right.pass.cpp | 3 +- .../layout_left/ctor.layout_stride.pass.cpp | 4 +- .../mdspan/layout_left/ctor.mapping.pass.cpp | 3 +- .../layout_left/index_operator.pass.cpp | 2 + .../mdspan/layout_left/properties.pass.cpp | 5 +- .../layout_left/required_span_size.pass.cpp | 2 +- .../layout_left/static_requirements.pass.cpp | 6 +- .../views/mdspan/layout_left/stride.pass.cpp | 4 +- .../mdspan/layout_right/comparison.pass.cpp | 5 +- .../mdspan/layout_right/ctor.default.pass.cpp | 1 + .../mdspan/layout_right/ctor.extents.pass.cpp | 1 + .../layout_right/ctor.layout_left.pass.cpp | 3 +- .../layout_right/ctor.layout_stride.pass.cpp | 4 +- .../mdspan/layout_right/ctor.mapping.pass.cpp | 3 +- .../layout_right/index_operator.pass.cpp | 2 + .../mdspan/layout_right/properties.pass.cpp | 5 +- .../layout_right/required_span_size.pass.cpp | 1 + .../layout_right/static_requirements.pass.cpp | 6 +- .../views/mdspan/layout_right/stride.pass.cpp | 4 +- .../mdspan/layout_stride/comparison.pass.cpp | 5 +- .../layout_stride/ctor.default.pass.cpp | 1 + .../layout_stride/ctor.extents_array.pass.cpp | 3 + .../layout_stride/ctor.extents_span.pass.cpp | 3 + .../ctor.strided_mapping.pass.cpp | 3 +- .../mdspan/layout_stride/deduction.pass.cpp | 11 +- .../layout_stride/index_operator.pass.cpp | 3 + .../is_exhaustive_corner_case.pass.cpp | 6 +- .../mdspan/layout_stride/properties.pass.cpp | 6 +- .../layout_stride/required_span_size.pass.cpp | 2 + .../static_requirements.pass.cpp | 6 +- .../mdspan/layout_stride/stride.pass.cpp | 4 +- .../views/mdspan/mdspan/assign.pass.cpp | 5 +- .../views/mdspan/mdspan/conversion.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.copy.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.default.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_array.pass.cpp | 3 +- .../mdspan/mdspan/ctor.dh_extents.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_integers.pass.cpp | 3 +- .../views/mdspan/mdspan/ctor.dh_map.pass.cpp | 5 +- .../mdspan/mdspan/ctor.dh_map_acc.pass.cpp | 5 +- .../views/mdspan/mdspan/ctor.dh_span.pass.cpp | 3 +- .../views/mdspan/mdspan/ctor.move.pass.cpp | 5 +- .../views/mdspan/mdspan/deduction.pass.cpp | 5 +- .../mdspan/mdspan/index_operator.pass.cpp | 1 + .../views/mdspan/mdspan/move.pass.cpp | 5 +- .../views/mdspan/mdspan/properties.pass.cpp | 5 +- .../views/mdspan/mdspan/swap.pass.cpp | 5 +- .../views/mdspan/mdspan/types.pass.cpp | 5 +- .../test_offset_time_zone.h | 1 + .../string_view.pass.cpp | 2 + .../string_view_local_time.pass.cpp | 2 + .../string_view_local_time_choose.pass.cpp | 2 +- .../string_view_sys_time.pass.cpp | 2 + ...ned_time_duration2_time_zone_ptr2.pass.cpp | 2 +- 
...e_duration2_time_zone_ptr2_choose.pass.cpp | 2 +- .../sys_time.pass.cpp | 1 + .../time_zone_pointer.pass.cpp | 1 + .../time_zone_pointer_local_time.pass.cpp | 2 +- ...me_zone_pointer_local_time_choose.pass.cpp | 2 +- .../time_zone_pointer_sys_time.pass.cpp | 2 +- .../get_local_time.pass.cpp | 1 + .../get_sys_time.pass.cpp | 1 + .../get_time_zone.pass.cpp | 1 + .../operator_local_time.pass.cpp | 1 + .../operator_sys_time.pass.cpp | 1 + .../charconv.from.chars/integral.pass.cpp | 2 + .../integral.roundtrip.pass.cpp | 2 + .../from_chars_result.operator_bool.pass.cpp | 1 + .../charconv.syn/from_chars_result.pass.cpp | 1 + .../to_chars_result.operator_bool.pass.cpp | 1 + .../charconv.syn/to_chars_result.pass.cpp | 1 + .../charconv.to.chars/integral.pass.cpp | 1 + 90 files changed, 234 insertions(+), 146 deletions(-) diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h index 96d88dcd90b4c..2c638e7ee354b 100644 --- a/libcxx/include/__filesystem/directory_entry.h +++ b/libcxx/include/__filesystem/directory_entry.h @@ -20,6 +20,7 @@ #include <__filesystem/operations.h> #include <__filesystem/path.h> #include <__filesystem/perms.h> +#include <__fwd/ostream.h> #include <__system_error/errc.h> #include <__system_error/error_code.h> #include <__utility/move.h> diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index ff468d517722f..eef1fc0db3ea7 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -21,6 +21,7 @@ #include <__type_traits/is_pointer.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> +#include <__utility/move.h> #include #include #include diff --git a/libcxx/include/__mdspan/layout_left.h b/libcxx/include/__mdspan/layout_left.h index d058cbccffd96..59574e83b0d7b 100644 --- a/libcxx/include/__mdspan/layout_left.h +++ b/libcxx/include/__mdspan/layout_left.h @@ -21,6 +21,7 @@ #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/__mdspan/layout_right.h b/libcxx/include/__mdspan/layout_right.h index 6842e9dc37fdc..d1acdb41238f7 100644 --- a/libcxx/include/__mdspan/layout_right.h +++ b/libcxx/include/__mdspan/layout_right.h @@ -21,6 +21,7 @@ #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/__mdspan/layout_stride.h b/libcxx/include/__mdspan/layout_stride.h index 86148ac849eca..704a5a4c1aea5 100644 --- a/libcxx/include/__mdspan/layout_stride.h +++ b/libcxx/include/__mdspan/layout_stride.h @@ -18,12 +18,15 @@ #define _LIBCPP___MDSPAN_LAYOUT_STRIDE_H #include <__assert> +#include <__concepts/same_as.h> #include <__config> #include <__fwd/mdspan.h> #include <__mdspan/extents.h> +#include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_same.h> #include <__utility/as_const.h> #include <__utility/integer_sequence.h> #include <__utility/swap.h> @@ -31,6 +34,7 @@ #include #include #include +#include #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 297d155cb5594..7cde21417561e 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -29,6 +29,16 @@ module std_bitset [system] { } module std_charconv [system] { header "charconv" + module chars_format { header "__charconv/chars_format.h" } + module from_chars_integral { header "__charconv/from_chars_integral.h" } + module from_chars_result { header "__charconv/from_chars_result.h" } + module tables { header "__charconv/tables.h" } + module to_chars { header "__charconv/to_chars.h" } + module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } + module to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } + module to_chars_integral { header "__charconv/to_chars_integral.h" } + module to_chars_result { header "__charconv/to_chars_result.h" } + module traits { header "__charconv/traits.h" } export * } module std_chrono [system] { @@ -53,10 +63,15 @@ module std_concepts [system] { } module std_condition_variable [system] { header "condition_variable" + module condition_variable { header "__condition_variable/condition_variable.h" } export * } module std_coroutine [system] { header "coroutine" + module coroutine_handle { header "__coroutine/coroutine_handle.h" } + module coroutine_traits { header "__coroutine/coroutine_traits.h" } + module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } + module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } export * } module std_deque [system] { @@ -77,6 +92,28 @@ module std_expected [system] { } module std_filesystem [system] { header "filesystem" + module copy_options { header "__filesystem/copy_options.h" } + module directory_entry { header "__filesystem/directory_entry.h" } + module directory_iterator { header "__filesystem/directory_iterator.h" } + module directory_options { header "__filesystem/directory_options.h" } + module file_status { header "__filesystem/file_status.h" } + module file_time_type { header "__filesystem/file_time_type.h" } + module file_type { header "__filesystem/file_type.h" } + module filesystem_error { + header "__filesystem/filesystem_error.h" + export std_private_memory_shared_ptr + } + module operations { header "__filesystem/operations.h" } + module path { + header "__filesystem/path.h" + export std_string // returned by various methods + } + module path_iterator { header "__filesystem/path_iterator.h" } + module perm_options { header "__filesystem/perm_options.h" } + module perms { header "__filesystem/perms.h" } + module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } + module space_info { header "__filesystem/space_info.h" } + module u8path { header "__filesystem/u8path.h" } export * } module std_format [system] { @@ -149,6 +186,16 @@ module std_map [system] { } module std_mdspan [system] { header "mdspan" + module default_accessor { header "__mdspan/default_accessor.h" } + module extents { header "__mdspan/extents.h" } + module fwd { header "__fwd/mdspan.h" } + module layout_left { header "__mdspan/layout_left.h" } + module layout_right { header "__mdspan/layout_right.h" } + module layout_stride { header "__mdspan/layout_stride.h" } + module mdspan { + header "__mdspan/mdspan.h" + export std_array // for strides() + } export * } module std_memory [system] { @@ -1082,23 +1129,6 @@ module std_private_bit_invert_if [system] { header 
"__bit/invert_if.h" } module std_private_bit_popcount [system] { header "__bit/popcount.h" } module std_private_bit_rotate [system] { header "__bit/rotate.h" } -module std_private_charconv_chars_format [system] { header "__charconv/chars_format.h" } -module std_private_charconv_from_chars_integral [system] { header "__charconv/from_chars_integral.h" } -module std_private_charconv_from_chars_result [system] { header "__charconv/from_chars_result.h" } -module std_private_charconv_tables [system] { header "__charconv/tables.h" } -module std_private_charconv_to_chars [system] { header "__charconv/to_chars.h" } -module std_private_charconv_to_chars_base_10 [system] { header "__charconv/to_chars_base_10.h" } -module std_private_charconv_to_chars_floating_point [system] { header "__charconv/to_chars_floating_point.h" } -module std_private_charconv_to_chars_integral [system] { - header "__charconv/to_chars_integral.h" - export std_private_charconv_traits -} -module std_private_charconv_to_chars_result [system] { - header "__charconv/to_chars_result.h" - export * -} -module std_private_charconv_traits [system] { header "__charconv/traits.h" } - module std_private_chrono_calendar [system] { header "__chrono/calendar.h" } module std_private_chrono_concepts [system] { header "__chrono/concepts.h" } module std_private_chrono_convert_to_timespec [system] { header "__chrono/convert_to_timespec.h" } @@ -1223,16 +1253,6 @@ module std_private_concepts_semiregular [system] { header "__concepts/ module std_private_concepts_swappable [system] { header "__concepts/swappable.h" } module std_private_concepts_totally_ordered [system] { header "__concepts/totally_ordered.h" } -module std_private_condition_variable_condition_variable [system] { - header "__condition_variable/condition_variable.h" - export * -} - -module std_private_coroutine_coroutine_handle [system] { header "__coroutine/coroutine_handle.h" } -module std_private_coroutine_coroutine_traits [system] { header "__coroutine/coroutine_traits.h" } -module std_private_coroutine_noop_coroutine_handle [system] { header "__coroutine/noop_coroutine_handle.h" } -module std_private_coroutine_trivial_awaitables [system] { header "__coroutine/trivial_awaitables.h" } - module std_private_debug_utils_randomize_range [system] { header "__debug_utils/randomize_range.h" } module std_private_debug_utils_sanitizers [system] { header "__debug_utils/sanitizers.h" } module std_private_debug_utils_strict_weak_ordering_check [system] { @@ -1256,38 +1276,6 @@ module std_private_expected_expected [system] { header "__expected/ex module std_private_expected_unexpect [system] { header "__expected/unexpect.h" } module std_private_expected_unexpected [system] { header "__expected/unexpected.h" } -module std_private_filesystem_copy_options [system] { header "__filesystem/copy_options.h" } -module std_private_filesystem_directory_entry [system] { - header "__filesystem/directory_entry.h" - export * -} -module std_private_filesystem_directory_iterator [system] { - header "__filesystem/directory_iterator.h" - export * -} -module std_private_filesystem_directory_options [system] { header "__filesystem/directory_options.h" } -module std_private_filesystem_file_status [system] { header "__filesystem/file_status.h" } -module std_private_filesystem_file_time_type [system] { header "__filesystem/file_time_type.h" } -module std_private_filesystem_file_type [system] { header "__filesystem/file_type.h" } -module std_private_filesystem_filesystem_error [system] { - header 
"__filesystem/filesystem_error.h" - export * -} -module std_private_filesystem_operations [system] { header "__filesystem/operations.h" } -module std_private_filesystem_path [system] { - header "__filesystem/path.h" - export * -} -module std_private_filesystem_path_iterator [system] { header "__filesystem/path_iterator.h" } -module std_private_filesystem_perm_options [system] { header "__filesystem/perm_options.h" } -module std_private_filesystem_perms [system] { header "__filesystem/perms.h" } -module std_private_filesystem_recursive_directory_iterator [system] { - header "__filesystem/recursive_directory_iterator.h" - export * -} -module std_private_filesystem_space_info [system] { header "__filesystem/space_info.h" } -module std_private_filesystem_u8path [system] { header "__filesystem/u8path.h" } - module std_private_format_buffer [system] { header "__format/buffer.h" } module std_private_format_concepts [system] { header "__format/concepts.h" } module std_private_format_container_adaptor [system] { header "__format/container_adaptor.h" } @@ -1496,17 +1484,6 @@ module std_private_math_special_functions [system] { header "__mat module std_private_math_traits [system] { header "__math/traits.h" } module std_private_math_trigonometric_functions [system] { header "__math/trigonometric_functions.h" } -module std_private_mdspan_default_accessor [system] { header "__mdspan/default_accessor.h" } -module std_private_mdspan_extents [system] { - header "__mdspan/extents.h" - export * -} -module std_private_mdspan_layout_left [system] { header "__mdspan/layout_left.h" } -module std_private_mdspan_layout_right [system] { header "__mdspan/layout_right.h" } -module std_private_mdspan_layout_stride [system] { header "__mdspan/layout_stride.h" } -module std_private_mdspan_mdspan [system] { header "__mdspan/mdspan.h" } -module std_private_mdspan_mdspan_fwd [system] { header "__fwd/mdspan.h" } - module std_private_memory_addressof [system] { header "__memory/addressof.h" } module std_private_memory_align [system] { header "__memory/align.h" } module std_private_memory_aligned_alloc [system] { header "__memory/aligned_alloc.h" } diff --git a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h index 588a5e9774a55..7cd42139758e3 100644 --- a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h +++ b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h @@ -25,6 +25,7 @@ #include #include #include +#include // dynamic_extent #include #include diff --git a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp index 77fbd46fb7ca7..574290ebec854 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp @@ -18,9 +18,10 @@ // #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp index 6b0ecff02baab..f6834b0b4133e 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp @@ -29,10 +29,11 @@ // (numeric_limits::max() < numeric_limits::max()) #include -#include -#include #include +#include #include +#include // dynamic_extent +#include #include 
"test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp index 9144bb6812e3c..1a6501b391396 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp index a9fc8f3bed074..2adfa49d3bc47 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/dextents.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp index e74bc0e66fca1..0476c11efdb64 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp b/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp index ba6941a1ab4c1..cdc0464251419 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/index_type.verify.cpp @@ -19,6 +19,7 @@ #include #include #include +#include // dynamic_extent void invalid_index_types() { // expected-error@*:* {{static assertion failed: extents::index_type must be a signed or unsigned integer type}} diff --git a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp index 0c8d3415a6726..29dd9e2d27072 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp @@ -28,6 +28,7 @@ #include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp index dbaff46e82b6b..2924da91f77ee 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp @@ -23,9 +23,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp index ab85ccf863963..c8b4083291a68 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp @@ -16,9 +16,10 @@ // Constraints: extents_type::rank() == OtherExtents::rank() is true. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp index ca478d047549e..5a4040317d243 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp index 5147539deed26..46505cb961bbd 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp @@ -20,6 +20,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp index 1b5a985c1eb33..5f9bd4344d0ec 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp @@ -21,9 +21,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp index 40ecef865477b..34489b7c52d7d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp @@ -24,9 +24,11 @@ // Effects: Direct-non-list-initializes extents_ with other.extents(). #include -#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp index ecfbd3fef705d..63b3c50c73175 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp @@ -19,9 +19,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp index 1b6cb5ab3fb25..40cd6bc2812e3 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp @@ -24,8 +24,10 @@ // * extents_type::index-cast(i) is a multidimensional index in extents_. 
#include +#include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp index e4ab972cb1093..cca1c65f088f1 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp @@ -27,9 +27,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp index d0ee090099cf2..4cb111d29827a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp @@ -14,10 +14,10 @@ // // Returns: extents().fwd-prod-of-extents(extents_type::rank()). - #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp index 23a7c8a09005c..7a6add60efcd1 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp index b5fc73b6fd5bf..064c279bcc49f 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). #include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp index 6ca9041fbf2f8..03c78ca5e91d9 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp @@ -16,9 +16,10 @@ // Constraints: extents_type::rank() == OtherExtents::rank() is true. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp index 71242faa05a32..f02174416f33c 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp @@ -17,6 +17,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp index d9fa771fcfcd1..9c2c39bc3cb3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp @@ -20,6 +20,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp index 13f2354c61407..61aba5dae6829 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp @@ -21,9 +21,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp index 3b7e793c69a17..3bc7d82f8ed8d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp @@ -24,9 +24,11 @@ // Effects: Direct-non-list-initializes extents_ with other.extents(). 
#include -#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp index dd71f3b2af5f2..eeea5ab021e97 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp @@ -19,9 +19,10 @@ // Preconditions: other.required_span_size() is representable as a value of type index_type #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp index 879e6713376d6..989078f17d303 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp @@ -26,6 +26,8 @@ #include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp index 94ffb1a4db5b4..120cefd619a3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp @@ -27,9 +27,10 @@ // } #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp index a2efa4ade6a0b..0128d8c26a83e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp @@ -18,6 +18,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp index c4e3d89cb94f4..2b11d17c6717a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp index 9a27859afb959..c04f07847c0be 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). 
#include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp index 7b452cc43f97d..7c9b4a4ded34d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/comparison.pass.cpp @@ -23,9 +23,10 @@ // Returns: true if x.extents() == y.extents() is true, OFFSET(y) == 0 is true, and each of x.stride(r) == y.stride(r) is true for r in the range [0, x.extents().rank()). Otherwise, false. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp index 2fa8866058720..108c4c6fca98e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp index f9157f8c6eedd..cecfb79ea6867 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp @@ -28,8 +28,11 @@ // direct-non-list-initializes strides_[d] with as_const(s[d]). #include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp index 36a87ae7a9e84..d0f26ad23df98 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp @@ -28,8 +28,11 @@ // direct-non-list-initializes strides_[d] with as_const(s[d]). 
#include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp index 2f73ed512fe45..6ba67ea2d0122 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.strided_mapping.pass.cpp @@ -36,9 +36,10 @@ // is-mapping-of)) #include -#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp index 259f61536ee37..45e2a1c10d6ff 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp @@ -11,9 +11,12 @@ // #include -#include -#include +#include #include +#include +#include +#include // dynamic_extent +#include #include "test_macros.h" @@ -33,7 +36,7 @@ constexpr bool test() { ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::array{1})), std::layout_stride::mapping>); ASSERT_SAME_TYPE( - decltype(std::layout_stride::mapping(std::extents(), std::array{3, 100})), + decltype(std::layout_stride::mapping(std::extents(), std::array{3, 100})), std::layout_stride::mapping>); ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::span())), @@ -43,7 +46,7 @@ constexpr bool test() { ASSERT_SAME_TYPE(decltype(std::layout_stride::mapping(std::extents(), std::declval>())), std::layout_stride::mapping>); ASSERT_SAME_TYPE( - decltype(std::layout_stride::mapping(std::extents(), std::declval>())), + decltype(std::layout_stride::mapping(std::extents(), std::declval>())), std::layout_stride::mapping>); return true; } diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp index 30281a8d922d1..5669991b8a13a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/index_operator.pass.cpp @@ -24,8 +24,11 @@ // * extents_type::index-cast(i) is a multidimensional index in extents_. #include +#include #include #include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp index a4218f34105ad..589e32f86e39d 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp @@ -20,9 +20,11 @@ // - Otherwise, false. 
#include -#include -#include +#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp index a5f77a6685470..b1eb84b375b6e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp @@ -39,9 +39,11 @@ // - Otherwise, false. #include -#include -#include +#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp index 3b6af9f6c1ff0..870518994a939 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp @@ -20,8 +20,10 @@ // Returns: REQUIRED-SPAN-SIZE(extents(), strides_). #include +#include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp index 8c5bf16b37c1e..a69fb4f287c3e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp @@ -77,9 +77,11 @@ // Returns: true only if m.is_strided() is true for all possible objects m of type M. #include -#include -#include #include +#include +#include // dynamic_extent +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp index 07a5199e5bbb1..2f16b1f6ec9aa 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/stride.pass.cpp @@ -19,10 +19,12 @@ // Returns: extents().rev-prod-of-extents(i). 
#include -#include #include +#include #include #include +#include // dynamic_extent + #include "test_macros.h" template diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp index 9e128f5180786..4c59b5b61a5a4 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/assign.pass.cpp @@ -12,9 +12,10 @@ // constexpr mdspan& operator=(const mdspan& rhs) = default; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp index db22e148b34c5..5fe14c1562f4d 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/conversion.pass.cpp @@ -39,9 +39,10 @@ // || !is_convertible_v #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp index 6d88e9ff02f96..9540c37cfc4b9 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.copy.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp index dbb2ad8b41bd4..5b84973bc43ef 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.default.pass.cpp @@ -22,9 +22,10 @@ // Effects: Value-initializes ptr_, map_, and acc_. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp index 0e0c7667da307..9ca4fe240495c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_array.pass.cpp @@ -29,9 +29,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp index 40e82db986350..4d9f91f63d54c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_extents.pass.cpp @@ -24,9 +24,10 @@ // - value-initializes acc_. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp index c2e8d26cd87f6..381b2d3a8bc2a 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_integers.pass.cpp @@ -30,9 +30,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp index fa65848ac69b5..f5e9c1b7e2a4c 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map.pass.cpp @@ -22,9 +22,10 @@ // - value-initializes acc_. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp index 65d32f3d7a7f6..3239c1d65deb9 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_map_acc.pass.cpp @@ -19,9 +19,10 @@ // - direct-non-list-initializes acc_ with a. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp index f4fb5e681d95f..2240dbe0801be 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.dh_span.pass.cpp @@ -29,9 +29,10 @@ // - value-initializes acc_. #include -#include #include +#include #include +#include // dynamic_extent #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp index c843c6033524c..46ba2b2096c40 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/ctor.move.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp index 876a3e84d6957..8b4d24d2a398d 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/deduction.pass.cpp @@ -52,9 +52,10 @@ // typename MappingType::layout_type, AccessorType>; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp index ffb10c007222d..22020b1f64881 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/index_operator.pass.cpp @@ -32,6 +32,7 @@ #include #include #include +#include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp index 9e1805431cd76..5ce2d06712bf6 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/move.pass.cpp @@ -14,9 +14,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp index ba1fef1df6779..6368acd0b0f41 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/properties.pass.cpp @@ -52,9 +52,10 @@ // A specialization of mdspan is a trivially copyable type if its accessor_type, mapping_type, and data_handle_type are trivially copyable types. 
#include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp index a9a8e9a264c16..47f2abecade8f 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/swap.pass.cpp @@ -17,9 +17,10 @@ // swap(x.acc_, y.acc_); #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp b/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp index 7e0235105b0b1..934e861e78d78 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/mdspan/types.pass.cpp @@ -28,9 +28,10 @@ // }; #include -#include -#include #include +#include +#include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h index e9262c5d95db1..bbd01d17b9ce3 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/test_offset_time_zone.h @@ -15,6 +15,7 @@ #include #include #include +#include #include enum class offset_time_zone_flags { diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp index c4c5e4f5b0231..18d13e32c63f0 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view.pass.cpp @@ -20,7 +20,9 @@ // explicit zoned_time(string_view name); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp index c7fe8f24db687..34d8925709725 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time.pass.cpp @@ -20,7 +20,9 @@ // zoned_time(string_view name, const local_time& st); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp index 69eb4a17aada4..9f82cab3461bd 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_local_time_choose.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(string_view name, const local_time& st, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp 
b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp index 108a4b44706b7..46da996aa13db 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_sys_time.pass.cpp @@ -20,7 +20,9 @@ // zoned_time(string_view name, const sys_time& st); #include +#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp index 68d7a5c74f47a..e4995f7400dac 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2.pass.cpp @@ -21,8 +21,8 @@ // zoned_time(string_view name, const zoned_time& y); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp index a96975cc611f9..31f35fed0f985 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/string_view_zoned_time_duration2_time_zone_ptr2_choose.pass.cpp @@ -21,8 +21,8 @@ // zoned_time(string_view name, const zoned_time& y, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp index 431e18f94243d..e4b9d10d2f0ac 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/sys_time.pass.cpp @@ -20,6 +20,7 @@ // zoned_time(const sys_time& st); #include +#include #include #include diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp index f18be244684bf..5b805fb5abb90 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer.pass.cpp @@ -20,6 +20,7 @@ // explicit zoned_time(TimeZonePtr z); #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp index 72116ca710e43..aa20e2420f4b9 100644 --- 
a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const local_time& st); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp index efe78141a5cf2..c78bf136fde44 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_local_time_choose.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const local_time& st, choose c); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp index d0769cbe1408b..290e564848683 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.ctor/time_zone_pointer_sys_time.pass.cpp @@ -20,8 +20,8 @@ // zoned_time(TimeZonePtr z, const sys_time& st); #include -#include #include +#include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp index c465cda36c51e..017d073789f7d 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_local_time.pass.cpp @@ -20,6 +20,7 @@ // local_time get_local_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp index 31fcbb689bd85..dde3cff9b5df8 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_sys_time.pass.cpp @@ -20,6 +20,7 @@ // sys_time get_sys_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp index af441b0857320..bc4c8be064f2c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/get_time_zone.pass.cpp @@ -20,6 +20,7 @@ // TimeZonePtr get_time_zone() const; #include +#include #include #include "../test_offset_time_zone.h" diff 
--git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp index 2a9c241997438..90d3484b2553c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_local_time.pass.cpp @@ -20,6 +20,7 @@ // explicit operator local_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp index 327ed495aed1a..68d8a15ec9889 100644 --- a/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.zonedtime/time.zone.zonedtime.members/operator_sys_time.pass.cpp @@ -20,6 +20,7 @@ // operator sys_time() const; #include +#include #include #include "../test_offset_time_zone.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp index c88bf11d61e0e..4708da4c38b9b 100644 --- a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.pass.cpp @@ -14,6 +14,8 @@ // Integral& value, int base = 10) #include +#include + #include "test_macros.h" #include "charconv_test_helpers.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp index 5b5ea7c91e8af..b1f2c77b34f8a 100644 --- a/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.from.chars/integral.roundtrip.pass.cpp @@ -16,6 +16,8 @@ // Integral& value, int base = 10) #include +#include + #include "test_macros.h" #include "charconv_test_helpers.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp index a6aa590ee944f..e0f20f1e8e3ff 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp index f218967b08196..24484cfe475f9 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp index 621eb8a493fd3..0908aa33ea7fc 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp +++ 
b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp index d4436d0c05b41..e746f19b2cd02 100644 --- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.pass.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp index 420b0e168539d..e89b340bdfcbe 100644 --- a/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp +++ b/libcxx/test/std/utilities/charconv/charconv.to.chars/integral.pass.cpp @@ -19,6 +19,7 @@ #include #include +#include #include "test_macros.h" #include "charconv_test_helpers.h" From 63da545ccdd41d9eb2392a8d0e848a65eb24f5fa Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 13:43:41 -0700 Subject: [PATCH 154/425] Revert "Reland "AtomicExpand: Allow incrementally legalizing atomicrmw"" (#107307) Reverts llvm/llvm-project#106793 `Next == E` is not enough: https://lab.llvm.org/buildbot/#/builders/169/builds/2834 `Next` is deleted by `processAtomicInstr` --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++----------- 5 files changed, 691 insertions(+), 836 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 2da723a0cc175..39a705599f90c 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,30 +351,17 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { - BasicBlock *BB = &*BBI; - ++BBI; - - BasicBlock::iterator Next; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Instruction &Inst = *I; - Next = std::next(I); - - if (processAtomicInstr(&Inst)) { - MadeChange = true; - - // Detect control flow change and resume iteration from the original - // block to inspect any newly inserted blocks. This allows incremental - // legalization of atomicrmw and cmpxchg. - if (Next == E || BB != Next->getParent()) { - BBI = BB->getIterator(); - BBE = F.end(); - break; - } - } - } + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
+ for (Instruction &I : instructions(F)) + if (I.isAtomic() && !isa(&I)) + AtomicInsts.push_back(&I); + + for (auto *I : AtomicInsts) { + if (processAtomicInstr(I)) + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc..0d230bb9dcc6e 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte 
Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7d..bfe0d20ca814b 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e42860..6b7d2df044460 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e73185..67e164037d5ce 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 52dc4918ca8b874ddd4e4fcad873a66ecc5b6953 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 16:47:20 -0400 Subject: [PATCH 155/425] [libc++][NFC] Use consistent layout for license in Python files Most Python files were using `# === [...]` instead of `#=== [...]` so I went with what was the most common in the codebase. --- libcxx/test/libcxx/clang_modules_include.gen.py | 4 ++-- libcxx/test/libcxx/clang_tidy.gen.py | 4 ++-- libcxx/test/libcxx/double_include.gen.py | 4 ++-- libcxx/test/libcxx/header_inclusions.gen.py | 4 ++-- libcxx/test/libcxx/libcpp_version.gen.py | 4 ++-- libcxx/test/libcxx/no_assert_include.gen.py | 4 ++-- libcxx/test/libcxx/system_reserved_names.gen.py | 4 ++-- libcxx/test/libcxx/transitive_includes.gen.py | 4 ++-- libcxx/utils/adb_run.py | 4 ++-- libcxx/utils/ci/Dockerfile | 4 ++-- libcxx/utils/ci/apple-install-libcxx.sh | 4 ++-- libcxx/utils/ci/build-picolibc.sh | 4 ++-- libcxx/utils/ci/buildkite-pipeline.yml | 4 ++-- libcxx/utils/ci/run-buildbot | 4 ++-- libcxx/utils/ci/vendor/android/Dockerfile.emulator | 4 ++-- libcxx/utils/ci/vendor/android/build-emulator-images.sh | 4 ++-- libcxx/utils/ci/vendor/android/container-setup.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-entrypoint.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-functions.sh | 4 ++-- libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh | 4 ++-- libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh | 4 ++-- libcxx/utils/ci/vendor/android/start-emulator.sh | 4 ++-- libcxx/utils/ci/vendor/android/stop-emulator.sh | 4 ++-- libcxx/utils/libcxx/test/android.py | 4 ++-- 24 files changed, 48 insertions(+), 48 deletions(-) diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index f084c38a8fbd3..f0421b2e73813 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we can include each header in a TU while using modules. # This is important notably because the LLDB data formatters use diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index 76b9db2d5cb83..5e84fbbb9913f 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Run our custom libc++ clang-tidy checks on all public headers. diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py index c7cb38b8f3590..afc2947dbece9 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/libcxx/double_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we can include each header in two TU's and link them together. diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index faaa4cf8710c1..2ecc47cbb1891 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that all headers include all the other headers they're supposed to, as # prescribed by the Standard. diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py index 7d9519d58955b..a9995295e21e4 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/libcxx/libcpp_version.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that all headers define the _LIBCPP_VERSION macro. diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py index dd8006df93764..67ab98603ca8f 100644 --- a/libcxx/test/libcxx/no_assert_include.gen.py +++ b/libcxx/test/libcxx/no_assert_include.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Ensure that none of the standard C++ headers implicitly include cassert or # assert.h (because assert() is implemented as a macro). diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 956a8d1abe3c3..e29e7a2cdd614 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that headers are not tripped up by the surrounding code defining various # alphabetic macros. Also ensure that we don't swallow the definition of user diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 834f21f125437..22075364bf1b7 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Test that we don't remove transitive includes of public C++ headers in the library accidentally. 
# When we remove a transitive public include, clients tend to break because they don't always diff --git a/libcxx/utils/adb_run.py b/libcxx/utils/adb_run.py index dc15b51d7f605..cddbd191e9881 100755 --- a/libcxx/utils/adb_run.py +++ b/libcxx/utils/adb_run.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## """adb_run.py is a utility for running a libc++ test program via adb. """ diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 490bee4942e03..dbecfd9fd3325 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This file defines the buildkite and github actions builder images. # You can build both images using: diff --git a/libcxx/utils/ci/apple-install-libcxx.sh b/libcxx/utils/ci/apple-install-libcxx.sh index 1ef52b11b70bd..1b1c30449d5af 100755 --- a/libcxx/utils/ci/apple-install-libcxx.sh +++ b/libcxx/utils/ci/apple-install-libcxx.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/build-picolibc.sh b/libcxx/utils/ci/build-picolibc.sh index 400e4dcab99a3..521c1bef9fc7e 100755 --- a/libcxx/utils/ci/build-picolibc.sh +++ b/libcxx/utils/ci/build-picolibc.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This script builds picolibc (https://github.com/picolibc/picolibc) from diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index d02d11ae7a756..906df734bc42b 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # This file describes the various pre-commit CI bots used to test libc++. diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 102f1669e63b8..14ff611302981 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -ex set -o pipefail diff --git a/libcxx/utils/ci/vendor/android/Dockerfile.emulator b/libcxx/utils/ci/vendor/android/Dockerfile.emulator index 2f52b27f6edcd..6ce9b82d258eb 100644 --- a/libcxx/utils/ci/vendor/android/Dockerfile.emulator +++ b/libcxx/utils/ci/vendor/android/Dockerfile.emulator @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## FROM ubuntu:jammy diff --git a/libcxx/utils/ci/vendor/android/build-emulator-images.sh b/libcxx/utils/ci/vendor/android/build-emulator-images.sh index f467ffc6231f5..4e29c172e47ed 100755 --- a/libcxx/utils/ci/vendor/android/build-emulator-images.sh +++ b/libcxx/utils/ci/vendor/android/build-emulator-images.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/vendor/android/container-setup.sh b/libcxx/utils/ci/vendor/android/container-setup.sh index 56bc232fefa1e..40e405b65f833 100755 --- a/libcxx/utils/ci/vendor/android/container-setup.sh +++ b/libcxx/utils/ci/vendor/android/container-setup.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh index 99d4995b2ee1d..dcd7870e37a7d 100755 --- a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh +++ b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # This script is the entrypoint of an Android Emulator Docker container. diff --git a/libcxx/utils/ci/vendor/android/emulator-functions.sh b/libcxx/utils/ci/vendor/android/emulator-functions.sh index 27eea2af157cc..56770654989d7 100644 --- a/libcxx/utils/ci/vendor/android/emulator-functions.sh +++ b/libcxx/utils/ci/vendor/android/emulator-functions.sh @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Bash functions for managing the names of emulator system images. diff --git a/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh b/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh index 0c35794792891..73b657cb1433b 100755 --- a/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh +++ b/libcxx/utils/ci/vendor/android/emulator-wait-for-ready.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -ex diff --git a/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh b/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh index 7de6cde7a7ad4..a822b26ffe6fd 100644 --- a/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh +++ b/libcxx/utils/ci/vendor/android/setup-env-for-emulator.sh @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## export ADB_SERVER_SOCKET="tcp:$(docker inspect \ -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ diff --git a/libcxx/utils/ci/vendor/android/start-emulator.sh b/libcxx/utils/ci/vendor/android/start-emulator.sh index 2d6e272675ea0..bec8048e5c5fa 100755 --- a/libcxx/utils/ci/vendor/android/start-emulator.sh +++ b/libcxx/utils/ci/vendor/android/start-emulator.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # Starts a new Docker container using a Docker image containing the Android # Emulator and an OS image. Stops and removes the old container if it exists diff --git a/libcxx/utils/ci/vendor/android/stop-emulator.sh b/libcxx/utils/ci/vendor/android/stop-emulator.sh index b5797ccb344f4..4964fcf204505 100755 --- a/libcxx/utils/ci/vendor/android/stop-emulator.sh +++ b/libcxx/utils/ci/vendor/android/stop-emulator.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## set -e diff --git a/libcxx/utils/libcxx/test/android.py b/libcxx/utils/libcxx/test/android.py index a40305b4dca02..4536c9b156823 100644 --- a/libcxx/utils/libcxx/test/android.py +++ b/libcxx/utils/libcxx/test/android.py @@ -1,10 +1,10 @@ -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -#===----------------------------------------------------------------------===## +# ===----------------------------------------------------------------------===## import re import select From 16900d3b98e6c8fbdad4411a054e3566bbbf9235 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 4 Sep 2024 22:01:04 +0100 Subject: [PATCH 156/425] LICM: hoist BO assoc when BinOp is in RHS (#107072) Extend hoistBOAssociation smoothly to handle the case when the inner BinaryOperator is in the RHS of the outer BinaryOperator. This completes the generalization of hoistBOAssociation, and the only limitation after this patch is the fact that only Add and Mul are hoisted. --- llvm/lib/Transforms/Scalar/LICM.cpp | 26 ++-- llvm/test/Transforms/LICM/hoist-binop.ll | 161 ++++++++++++++++++++++- 2 files changed, 171 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 5cf7c252bb5f3..23e9c70b62642 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2803,15 +2803,14 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, /// Reassociate associative binary expressions of the form /// -/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" if op is an associative BinOp -/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is a commutative BinOp +/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)" +/// 2. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" +/// 3. "C2 op (C1 op LV)" ==> "LV op (C1 op C2)" +/// 4. "C2 op (LV op C1)" ==> "LV op (C1 op C2)" /// -/// where LV is a loop variant, and C1 and C2 are loop invariants that we want -/// to hoist. -/// -/// TODO: This can be extended to more cases such as -/// 1. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV" if op an associative BinOp -/// 2. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is a commutative BinOp +/// where op is an associative BinOp, LV is a loop variant, and C1 and C2 are +/// loop invariants that we want to hoist, noting that associativity implies +/// commutativity. static bool hoistBOAssociation(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, @@ -2825,19 +2824,20 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, if (Opcode != Instruction::Add && Opcode != Instruction::Mul) return false; - auto *BO0 = dyn_cast(BO->getOperand(0)); + bool LVInRHS = L.isLoopInvariant(BO->getOperand(0)); + auto *BO0 = dyn_cast(BO->getOperand(LVInRHS)); if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() || BO0->hasNUsesOrMore(3)) return false; Value *LV = BO0->getOperand(0); Value *C1 = BO0->getOperand(1); - Value *C2 = BO->getOperand(1); + Value *C2 = BO->getOperand(!LVInRHS); - if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) { - assert(BO0->isCommutative() && "Associativity implies commutativity"); + assert(BO->isCommutative() && BO0->isCommutative() && + "Associativity implies commutativity"); + if (L.isLoopInvariant(LV) && !L.isLoopInvariant(C1)) std::swap(LV, C1); - } if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2)) return false; diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll index b0ee45a5fb350..a840e24757884 100644 --- a/llvm/test/Transforms/LICM/hoist-binop.ll +++ b/llvm/test/Transforms/LICM/hoist-binop.ll @@ -67,8 +67,8 @@ loop: br label %loop } - -; Hoist ADD and copy NUW if both ops have it. Commutative version. 
+; Hoist ADD and copy NUW if both ops have it. +; Version where operands are commuted. define void @add_nuw_comm(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_nuw_comm( ; CHECK-NEXT: entry: @@ -92,6 +92,83 @@ loop: br label %loop } +; Hoist ADD and copy NUW if both ops have it. +; Another version where operands are commuted. +define void @add_nuw_comm2(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %index, %c1 + call void @use(i64 %step.add) + %index.next = add nuw i64 %c2, %step.add + br label %loop +} + +; Hoist ADD and copy NUW if both ops have it. +; Another version where operands are commuted. +define void @add_nuw_comm3(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_comm3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = add nuw i64 %c2, %step.add + br label %loop +} + +; Hoist ADD and copy NUW if both ops have it. +; A version where the LHS and RHS of the outer BinOp are BinOps. +define void @add_nuw_twobinops(i64 %c1, i64 %c2) { +; CHECK-LABEL: @add_nuw_twobinops( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C2_PLUS_2:%.*]] = add nuw i64 [[C2:%.*]], 2 +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2_PLUS_2]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = add nuw i64 %c1, %index + call void @use(i64 %step.add) + %c2.plus.2 = add nuw i64 %c2, 2 + %index.next = add nuw i64 %step.add, %c2.plus.2 + br label %loop +} + ; Hoist MUL and drop NUW even if both ops have it. define void @mul_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw( @@ -116,7 +193,8 @@ loop: br label %loop } -; Hoist MUL and drop NUW even if both ops have it. Commutative version. +; Hoist MUL and drop NUW even if both ops have it. +; Version where operands are commuted. define void @mul_nuw_comm(i64 %c1, i64 %c2) { ; CHECK-LABEL: @mul_nuw_comm( ; CHECK-NEXT: entry: @@ -140,6 +218,83 @@ loop: br label %loop } +; Hoist MUL and drop NUW even if both ops have it. +; Another version where operands are commuted. 
+define void @mul_nuw_comm2(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[INDEX]], [[C1]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %index, %c1 + call void @use(i64 %step.add) + %index.next = mul nuw i64 %c2, %step.add + br label %loop +} + +; Hoist MUL and drop NUW even if both ops have it. +; Another version where operands are commuted. +define void @mul_nuw_comm3(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_comm3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %index.next = mul nuw i64 %c2, %step.add + br label %loop +} + +; Hoist MUL and drop NUW even if both ops have it. +; A version where the LHS and RHS of the outer BinOp are BinOps. +define void @mul_nuw_twobinops(i64 %c1, i64 %c2) { +; CHECK-LABEL: @mul_nuw_twobinops( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C2_PLUS_2:%.*]] = add nuw i64 [[C2:%.*]], 2 +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2_PLUS_2]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]] +; CHECK-NEXT: call void @use(i64 [[STEP_ADD]]) +; CHECK-NEXT: [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]] +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] + %step.add = mul nuw i64 %c1, %index + call void @use(i64 %step.add) + %c2.plus.2 = add nuw i64 %c2, 2 + %index.next = mul nuw i64 %step.add, %c2.plus.2 + br label %loop +} + ; Hoist ADD but don't copy NUW if only one op has it. define void @add_no_nuw(i64 %c1, i64 %c2) { ; CHECK-LABEL: @add_no_nuw( From 1ff8657b26870e9db4527b621fab0d21b6cbdc3c Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Wed, 4 Sep 2024 14:12:25 -0700 Subject: [PATCH 157/425] [scudo] Use variable instead of recomputing. (#106647) In the get fragmentation functions, there is already a variable that computes the in use bytes, so use that instead of recomputing it. 
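Shape of the change, as a standalone sketch (identifiers are illustrative, not the actual scudo internals): the byte total is computed once, and the later percentage computation should reference that value rather than repeat the pages-times-page-size multiplication.

  #include <cstdint>
  using uptr = uintptr_t;

  // Illustrative only: once InUseBytes has been computed, later uses pass it
  // along instead of re-evaluating InUsePages * PageSize at the call site.
  uptr inUseUtilizationPercent(uptr BlockSize, uptr InUseBlocks,
                               uptr InUsePages, uptr PageSize) {
    const uptr InUseBytes = InUsePages * PageSize; // computed once, reused below
    return InUseBytes ? (BlockSize * InUseBlocks * 100) / InUseBytes : 0;
  }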
--- compiler-rt/lib/scudo/standalone/primary32.h | 2 +- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index ebfb8dfe0a31f..87264471be2c1 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -940,7 +940,7 @@ template class SizeClassAllocator32 { uptr Integral; uptr Fractional; - computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + computePercentage(BlockSize * InUseBlocks, InUseBytes, &Integral, &Fractional); Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 8436f33c2fdcf..ffcc22fea0c6e 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1185,7 +1185,7 @@ template class SizeClassAllocator64 { uptr Integral; uptr Fractional; - computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + computePercentage(BlockSize * InUseBlocks, InUseBytes, &Integral, &Fractional); Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", From dd754cd262222bcb489038ac791e4278d90697f0 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Wed, 4 Sep 2024 14:32:46 -0700 Subject: [PATCH 158/425] [compiler-rt][nsan] Update UnwindImpl (#107313) Implement __sanitizer::BufferedStackTrace::UnwindImpl following msan. --- compiler-rt/lib/nsan/nsan.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index 4be9c673bd4e0..4679bcd589eb4 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -200,7 +200,14 @@ void __sanitizer::BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, bool request_fast, u32 max_depth) { using namespace __nsan; - return Unwind(max_depth, pc, bp, context, 0, 0, false); + NsanThread *t = GetCurrentThread(); + if (!t || !StackTrace::WillUseFastUnwind(request_fast)) + return Unwind(max_depth, pc, bp, context, t ? t->stack_top() : 0, + t ? t->stack_bottom() : 0, false); + if (StackTrace::WillUseFastUnwind(request_fast)) + Unwind(max_depth, pc, bp, nullptr, t->stack_top(), t->stack_bottom(), true); + else + Unwind(max_depth, pc, 0, context, 0, 0, false); } extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_print_accumulated_stats() { From dcf0160bd61d150e7b94067fcd991b466a361b08 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 14:46:48 -0700 Subject: [PATCH 159/425] [TableGen] Optimize intrinsic info type signature encoding (#106809) Change the "fixed encoding" table used for encoding intrinsic type signature to use 16-bit encoding as opposed to 32-bit. This results in both space and time improvements. For space, the total static storage size (in bytes) of this info reduces by 50%: - Current = 14193*4 (Fixed table) + 16058 + 3 (Long Table) = 72833 - New size = 14193*2 (Fixed table) + 19879 + 3 (Long Table) = 48268. - Reduction = 50.9% For time, with the added benchmark, we see a 7.3% speedup in `GetIntrinsicInfoTableEntries` benchmark. Actual output of the benchmark in included in the GitHub MR. 
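To make the packing rule concrete, the sketch below (standalone and simplified; packFixed16 and longTableEntry are illustrative names, not the emitter's API) shows how a nibble signature either fits the 15 usable bits of a 16-bit entry or is redirected to the long encoding table via an offset with the top bit set:

  #include <cstdint>
  #include <optional>
  #include <vector>

  // Pack a type signature, one nibble per element with element 0 in the low
  // nibble, into a 16-bit table entry. The most significant bit is reserved
  // as the "offset into the long encoding table" sentinel, so only values
  // that fit in the low 15 bits can use the fixed encoding.
  std::optional<uint16_t> packFixed16(const std::vector<unsigned char> &Sig) {
    uint32_t Packed = 0;
    for (auto It = Sig.rbegin(); It != Sig.rend(); ++It) {
      if (*It > 15)
        return std::nullopt;          // element does not fit in a nibble
      Packed = (Packed << 4) | *It;
    }
    if (Packed & ~0x7fffu)            // would clobber the sentinel bit
      return std::nullopt;
    return static_cast<uint16_t>(Packed);
  }

  // Signatures that do not fit are stored as (1 << 15) | Offset instead.
  uint16_t longTableEntry(unsigned Offset) {
    return static_cast<uint16_t>((1u << 15) | Offset);
  }

Short signatures pack directly; longer ones spill to IIT_LongEncodingTable, which is why that table grows while the fixed table halves in width.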
--- llvm/benchmarks/CMakeLists.txt | 1 + .../GetIntrinsicInfoTableEntriesBM.cpp | 30 +++++++ llvm/lib/IR/Function.cpp | 14 ++-- llvm/utils/TableGen/IntrinsicEmitter.cpp | 80 +++++++++++-------- 4 files changed, 87 insertions(+), 38 deletions(-) create mode 100644 llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e3366e6f3ffe1..aa0cb77773344 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -6,3 +6,4 @@ add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp new file mode 100644 index 0000000000000..7f3bd3bc9eb6b --- /dev/null +++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp @@ -0,0 +1,30 @@ +//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) { + SmallVector Table; + for (auto _ : state) { + for (ID ID = 1; ID < num_intrinsics; ++ID) { + // This makes sure the vector does not keep growing, as well as after the + // first iteration does not result in additional allocations. + Table.clear(); + getIntrinsicInfoTableEntries(ID, Table); + } + } +} + +BENCHMARK(BM_GetIntrinsicInfoTableEntries); + +BENCHMARK_MAIN(); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 69520fdb03dc7..afef8930669e8 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1381,22 +1381,24 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, void Intrinsic::getIntrinsicInfoTableEntries(ID id, SmallVectorImpl &T){ + static_assert(sizeof(IIT_Table[0]) == 2, + "Expect 16-bit entries in IIT_Table"); // Check to see if the intrinsic's type was expressible by the table. - unsigned TableVal = IIT_Table[id-1]; + uint16_t TableVal = IIT_Table[id - 1]; // Decode the TableVal into an array of IITValues. - SmallVector IITValues; + SmallVector IITValues; ArrayRef IITEntries; unsigned NextElt = 0; - if ((TableVal >> 31) != 0) { + if (TableVal >> 15) { // This is an offset into the IIT_LongEncodingTable. IITEntries = IIT_LongEncodingTable; // Strip sentinel bit. - NextElt = (TableVal << 1) >> 1; + NextElt = TableVal & 0x7fff; } else { - // Decode the TableVal into an array of IITValues. If the entry was encoded - // into a single word in the table itself, decode it now. + // If the entry was encoded into a single word in the table itself, decode + // it from an array of nibbles to an array of bytes. 
do { IITValues.push_back(TableVal & 0xF); TableVal >>= 4; diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 09eb1ed5e1863..0f4d7bf8db217 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -282,11 +282,37 @@ static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { return TypeSig; } +// Pack the type signature into 32-bit fixed encoding word. +static std::optional encodePacked(const TypeSigTy &TypeSig) { + if (TypeSig.size() > 8) + return std::nullopt; + + uint32_t Result = 0; + for (unsigned char C : reverse(TypeSig)) { + if (C > 15) + return std::nullopt; + Result = (Result << 4) | C; + } + return Result; +} + void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - // If we can compute a 32-bit fixed encoding for this intrinsic, do so and + // Note: the code below can be switched to use 32-bit fixed encoding by + // flipping the flag below. + constexpr bool Use16BitFixedEncoding = true; + using FixedEncodingTy = + std::conditional_t; + constexpr unsigned FixedEncodingBits = sizeof(FixedEncodingTy) * CHAR_BIT; + // Mask with all bits 1 except the most significant bit. + const unsigned Mask = (1U << (FixedEncodingBits - 1)) - 1; + const unsigned MSBPostion = FixedEncodingBits - 1; + StringRef FixedEncodingTypeName = + Use16BitFixedEncoding ? "uint16_t" : "uint32_t"; + + // If we can compute a 16/32-bit fixed encoding for this intrinsic, do so and // capture it in this vector, otherwise store a ~0U. - std::vector FixedEncodings; + std::vector FixedEncodings; SequenceToOffsetTable LongEncodingTable; FixedEncodings.reserve(Ints.size()); @@ -296,69 +322,59 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints, // Get the signature for the intrinsic. TypeSigTy TypeSig = ComputeTypeSignature(Int); - // Check to see if we can encode it into a 32-bit word. We can only encode - // 8 nibbles into a 32-bit word. - if (TypeSig.size() <= 8) { - // Attempt to pack elements of TypeSig into a 32-bit word, starting from - // the most significant nibble. - unsigned Result = 0; - bool Failed = false; - for (unsigned char C : reverse(TypeSig)) { - if (C > 15) { - Failed = true; - break; - } - Result = (Result << 4) | C; - } - - // If this could be encoded into a 31-bit word, return it. - if (!Failed && (Result >> 31) == 0) { - FixedEncodings.push_back(Result); - continue; - } + // Check to see if we can encode it into a 16/32 bit word. + std::optional Result = encodePacked(TypeSig); + if (Result && (*Result & Mask) == Result) { + FixedEncodings.push_back(static_cast(*Result)); + continue; } - // Otherwise, we're going to unique the sequence into the - // LongEncodingTable, and use its offset in the 32-bit table instead. LongEncodingTable.add(TypeSig); // This is a placehold that we'll replace after the table is laid out. - FixedEncodings.push_back(~0U); + FixedEncodings.push_back(static_cast(~0U)); } LongEncodingTable.layout(); - OS << R"(// Global intrinsic function declaration type table. + OS << formatv(R"(// Global intrinsic function declaration type table. #ifdef GET_INTRINSIC_GENERATOR_GLOBAL -static constexpr unsigned IIT_Table[] = { - )"; +static constexpr {0} IIT_Table[] = {{ + )", + FixedEncodingTypeName); + unsigned MaxOffset = 0; for (auto [Idx, FixedEncoding, Int] : enumerate(FixedEncodings, Ints)) { if ((Idx & 7) == 7) OS << "\n "; // If the entry fit in the table, just emit it. 
- if (FixedEncoding != ~0U) { + if ((FixedEncoding & Mask) == FixedEncoding) { OS << "0x" << Twine::utohexstr(FixedEncoding) << ", "; continue; } TypeSigTy TypeSig = ComputeTypeSignature(Int); + unsigned Offset = LongEncodingTable.get(TypeSig); + MaxOffset = std::max(MaxOffset, Offset); // Otherwise, emit the offset into the long encoding table. We emit it this // way so that it is easier to read the offset in the .def file. - OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", "; + OS << formatv("(1U<<{0}) | {1}, ", MSBPostion, Offset); } OS << "0\n};\n\n"; + // verify that all offsets will fit in 16/32 bits. + if ((MaxOffset & Mask) != MaxOffset) + PrintFatalError("Offset of long encoding table exceeds encoding bits"); + // Emit the shared table of register lists. OS << "static constexpr unsigned char IIT_LongEncodingTable[] = {\n"; if (!LongEncodingTable.empty()) LongEncodingTable.emit( OS, [](raw_ostream &OS, unsigned char C) { OS << (unsigned)C; }); - OS << " 255\n};\n\n"; - + OS << " 255\n};\n"; OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL } From 660cc98647677815a3f5d97d00220071d8cf7a4f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 14:58:01 -0700 Subject: [PATCH 160/425] [TableGen] Add `CodeGenIntrinsicsMap` for on-demand intrinsic creation (#107100) - Add class `CodeGenIntrinsicMap` for on-demand creation of `CodeGenIntrinsic`. - Add class `CodeGenIntrinsicContext` to capture global information required to build `CodeGenIntrinsic` objects. - Adopt GlobalISel PatternParser and SearchableTableEmitter to use it. --- .../TableGen/Basic/CodeGenIntrinsics.cpp | 28 +++++++++++++------ llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 22 +++++++++++++-- .../TableGen/Common/CodeGenDAGPatterns.cpp | 6 ++-- .../Common/GlobalISel/PatternParser.cpp | 11 +++----- .../utils/TableGen/SearchableTableEmitter.cpp | 13 +++++---- 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 4bca904e9f38b..c0edbf0f01523 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -25,20 +25,20 @@ using namespace llvm; // CodeGenIntrinsic Implementation //===----------------------------------------------------------------------===// -CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { - std::vector IntrProperties = - RC.getAllDerivedDefinitions("IntrinsicProperty"); - - std::vector DefaultProperties; - for (const Record *Rec : IntrProperties) +CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { + for (const Record *Rec : RC.getAllDerivedDefinitions("IntrinsicProperty")) if (Rec->getValueAsBit("IsDefault")) DefaultProperties.push_back(Rec); +} + +CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { + CodeGenIntrinsicContext Ctx(RC); std::vector Defs = RC.getAllDerivedDefinitions("Intrinsic"); Intrinsics.reserve(Defs.size()); for (const Record *Def : Defs) - Intrinsics.push_back(CodeGenIntrinsic(Def, DefaultProperties)); + Intrinsics.push_back(CodeGenIntrinsic(Def, Ctx)); llvm::sort(Intrinsics, [](const CodeGenIntrinsic &LHS, const CodeGenIntrinsic &RHS) { @@ -54,8 +54,18 @@ CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { Targets.back().Count = Intrinsics.size() - Targets.back().Offset; } +CodeGenIntrinsic &CodeGenIntrinsicMap::operator[](const Record *Record) { + if (!Record->isSubClassOf("Intrinsic")) + 
PrintFatalError("Intrinsic defs should be subclass of 'Intrinsic' class"); + + auto [Iter, Inserted] = Map.try_emplace(Record); + if (Inserted) + Iter->second = std::make_unique(Record, Ctx); + return *Iter->second; +} + CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, - ArrayRef DefaultProperties) + const CodeGenIntrinsicContext &Ctx) : TheDef(R) { StringRef DefName = TheDef->getName(); ArrayRef DefLoc = R->getLoc(); @@ -119,7 +129,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, } // Set default properties to true. - setDefaultProperties(DefaultProperties); + setDefaultProperties(Ctx.DefaultProperties); // Also record the SDPatternOperator Properties. Properties = parseSDPatternOperatorProperties(R); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 19e29af2fa85f..51c2359155380 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -15,6 +15,7 @@ #include "SDNodeProperties.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ModRef.h" #include @@ -25,6 +26,12 @@ namespace llvm { class Record; class RecordKeeper; +// Global information needed to build intrinsics. +struct CodeGenIntrinsicContext { + explicit CodeGenIntrinsicContext(const RecordKeeper &RC); + std::vector DefaultProperties; +}; + struct CodeGenIntrinsic { const Record *TheDef; // The actual record defining this intrinsic. std::string Name; // The name of the LLVM function "llvm.bswap.i32" @@ -155,8 +162,7 @@ struct CodeGenIntrinsic { bool isParamImmArg(unsigned ParamIdx) const; - CodeGenIntrinsic(const Record *R, - ArrayRef DefaultProperties = {}); + CodeGenIntrinsic(const Record *R, const CodeGenIntrinsicContext &Ctx); }; class CodeGenIntrinsicTable { @@ -171,7 +177,6 @@ class CodeGenIntrinsicTable { std::vector Targets; explicit CodeGenIntrinsicTable(const RecordKeeper &RC); - CodeGenIntrinsicTable() = default; bool empty() const { return Intrinsics.empty(); } size_t size() const { return Intrinsics.size(); } @@ -182,6 +187,17 @@ class CodeGenIntrinsicTable { return Intrinsics[Pos]; } }; + +// This class builds `CodeGenIntrinsic` on demand for a given Def. 
+class CodeGenIntrinsicMap { + DenseMap> Map; + const CodeGenIntrinsicContext Ctx; + +public: + explicit CodeGenIntrinsicMap(const RecordKeeper &RC) : Ctx(RC) {} + CodeGenIntrinsic &operator[](const Record *Def); +}; + } // namespace llvm #endif // LLVM_UTILS_TABLEGEN_BASIC_CODEGENINTRINSICS_H diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index a8cecca0d4a54..df3f72ff2ec7f 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -3160,10 +3160,8 @@ void TreePattern::dump() const { print(errs()); } CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R, PatternRewriterFn PatternRewriter) - : Records(R), Target(R), LegalVTS(Target.getLegalValueTypes()), - PatternRewriter(PatternRewriter) { - - Intrinsics = CodeGenIntrinsicTable(Records); + : Records(R), Target(R), Intrinsics(R), + LegalVTS(Target.getLegalValueTypes()), PatternRewriter(PatternRewriter) { ParseNodeInfo(); ParseNodeTransforms(); ParseComplexPatterns(); diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp index 82fa3c8f3121f..73b6097554eda 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp @@ -107,13 +107,10 @@ getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) { static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) { // Intrinsics need to have a static lifetime because the match table keeps // references to CodeGenIntrinsic objects. - static DenseMap> - AllIntrinsics; - - auto &Ptr = AllIntrinsics[R]; - if (!Ptr) - Ptr = std::make_unique(R); - return Ptr.get(); + static CodeGenIntrinsicMap *AllIntrinsics; + if (!AllIntrinsics) + AllIntrinsics = new CodeGenIntrinsicMap(R->getRecords()); + return &(*AllIntrinsics)[R]; } std::unique_ptr diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index 59ae3bf3daedc..8d394f8051ced 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -94,7 +94,7 @@ struct GenericTable { class SearchableTableEmitter { RecordKeeper &Records; std::unique_ptr Target; - DenseMap> Intrinsics; + std::unique_ptr Intrinsics; std::vector> Enums; DenseMap EnumMap; std::set PreprocessorGuards; @@ -146,10 +146,13 @@ class SearchableTableEmitter { } CodeGenIntrinsic &getIntrinsic(Init *I) { - std::unique_ptr &Intr = Intrinsics[I]; - if (!Intr) - Intr = std::make_unique(cast(I)->getDef()); - return *Intr; + const Record *Def = cast(I)->getDef(); + // Build the Intrinsics map on demand. If we instantiate one in the + // constructor, we may get errors if the TableGen file being processed does + // not include Intrinsics.td and does not do anything with intrinsics. + if (!Intrinsics) + Intrinsics = std::make_unique(Records); + return (*Intrinsics)[Def]; } bool compareBy(Record *LHS, Record *RHS, const SearchIndex &Index); From 98c6bbfe1f3a348633e5e4c192a0134891fe3849 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 4 Sep 2024 15:04:10 -0700 Subject: [PATCH 161/425] [TableGen] Refactor Intrinsics record (#106986) Eliminate unused `isTarget` field in Intrinsic record. Eliminate `isOverloaded`, `Types` and `TypeSig` fields from the record, as they are already available through the `TypeInfo` field. 
Change intrinsic emitter code to look for this info using fields of the `TypeInfo` record attached to the `Intrinsic` record. Fix several intrinsic related unit tests to source the `Intrinsic` class def from Intrinsics.td as opposed to defining a skeleton in the test. This eliminates some duplication of information in the Intrinsic class, as well as reduces the memory allocated for record fields, resulting in ~2% reduction (though that's not the main goal). --- llvm/include/llvm/IR/Intrinsics.td | 5 -- llvm/test/TableGen/intrinsic-attrs.td | 52 +------------------ llvm/test/TableGen/intrinsic-long-name.td | 36 ++----------- llvm/test/TableGen/intrinsic-struct.td | 37 ++----------- .../TableGen/searchabletables-intrinsic.td | 37 ++----------- .../TableGen/Basic/CodeGenIntrinsics.cpp | 23 ++++---- llvm/utils/TableGen/IntrinsicEmitter.cpp | 11 ++-- 7 files changed, 35 insertions(+), 166 deletions(-) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 232d6be1073f4..1bc895eee60f1 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -669,12 +669,7 @@ class Intrinsic ret_types, // IntrinsicProperty<1> bit DisableDefaultAttributes = disable_default_attributes; - bit isTarget = false; - TypeInfoGen TypeInfo = TypeInfoGen; - bit isOverloaded = TypeInfo.isOverloaded; - list Types = TypeInfo.Types; - list> TypeSig = TypeInfo.TypeSig; } // Intrinsic with default attributes (disable_default_attributes = false). diff --git a/llvm/test/TableGen/intrinsic-attrs.td b/llvm/test/TableGen/intrinsic-attrs.td index 29e8cb1e89bb0..3228b32405103 100644 --- a/llvm/test/TableGen/intrinsic-attrs.td +++ b/llvm/test/TableGen/intrinsic-attrs.td @@ -1,54 +1,6 @@ -// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s -// Get the minimum blurb necessary to process ... -include "llvm/CodeGen/ValueTypes.td" -include "llvm/CodeGen/SDNodeProperties.td" - -class LLVMType { - ValueType VT = vt; - int isAny = 0; -} - -def llvm_i32_ty : LLVMType; -def llvm_ptr_ty : LLVMType; - -class AttrIndex { - int Value = idx; -} - -def FuncIndex : AttrIndex<-1>; -def RetIndex : AttrIndex<0>; -class ArgIndex : AttrIndex; - -class IntrinsicProperty { - bit IsDefault = is_default; -} - -def IntrNoMem : IntrinsicProperty; -def IntrHasSideEffects : IntrinsicProperty; -class Dereferenceable : IntrinsicProperty { - int ArgNo = idx.Value; - int Bytes = bytes; -} - -class Intrinsic ret_types, - list param_types = [], - list intr_properties = [], - string name = "", - list sd_properties = [], - bit disable_default_attributes = 0> : SDPatternOperator { - string LLVMName = name; - string TargetPrefix = ""; - list RetTypes = ret_types; - list ParamTypes = param_types; - list IntrProperties = intr_properties; - let Properties = sd_properties; - bit DisableDefaultAttributes = 1; - - - bit isTarget = 0; - bit DisableDefaultAttributes = disable_default_attributes; -} +include "llvm/IR/Intrinsics.td" // ... this intrinsic. 
def int_random_gen : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>; diff --git a/llvm/test/TableGen/intrinsic-long-name.td b/llvm/test/TableGen/intrinsic-long-name.td index d66173202302b..c19910d474ed1 100644 --- a/llvm/test/TableGen/intrinsic-long-name.td +++ b/llvm/test/TableGen/intrinsic-long-name.td @@ -1,38 +1,10 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic param_types = []> { - string LLVMName = name; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = []; - list ParamTypes = param_types; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" // Make sure we generate the long name without crashing // CHECK: this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash, // llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash -def int_foo : Intrinsic<"llvm.foo", [llvm_anyint_ty]>; -def int_this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash : Intrinsic<"llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash", [llvm_anyint_ty]>; +def int_foo : Intrinsic<[llvm_anyint_ty], [], [], "llvm.foo">; +def int_this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash : Intrinsic<[llvm_anyint_ty], [], [], "llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash">; diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index bc044a4a6f858..f23a7a7643af2 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,38 +1,11 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums %s | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s // XFAIL: vg_leak -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic ret_types = []> { - string LLVMName = name; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = ret_types; - list ParamTypes = []; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" // Make sure we can return up to 8 values // CHECK: returns_8_results = {{[0-9]+}}, // llvm.returns.8.results -def int_returns_8_results : Intrinsic<"llvm.returns.8.results", +def int_returns_8_results : Intrinsic< [llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, - llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty]>; + llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty], + [], [], "llvm.returns.8.results">; diff --git a/llvm/test/TableGen/searchabletables-intrinsic.td b/llvm/test/TableGen/searchabletables-intrinsic.td index 75722d19b16e9..d4ec105f0243b 100644 --- a/llvm/test/TableGen/searchabletables-intrinsic.td +++ b/llvm/test/TableGen/searchabletables-intrinsic.td @@ -1,48 +1,19 @@ -// 
RUN: llvm-tblgen -gen-searchable-tables -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-searchable-tables -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s // XFAIL: vg_leak include "llvm/TableGen/SearchableTable.td" - -class IntrinsicProperty { - bit IsDefault = is_default; -} - -class SDNodeProperty; - -class ValueType { - string Namespace = "MVT"; - int Size = size; - int Value = value; -} - -class LLVMType { - ValueType VT = vt; -} - -class Intrinsic param_types = []> { - string LLVMName = ""; - bit isTarget = 0; - string TargetPrefix = ""; - list RetTypes = []; - list ParamTypes = param_types; - list IntrProperties = []; - list Properties = []; - bit DisableDefaultAttributes = 1; -} - -def iAny : ValueType<0, 253>; -def llvm_anyint_ty : LLVMType; +include "llvm/IR/Intrinsics.td" def int_abc : Intrinsic<[llvm_anyint_ty]>; def int_xyz : Intrinsic<[llvm_anyint_ty]>; -let isTarget = 1, TargetPrefix = "gtarget" in { +let TargetPrefix = "gtarget" in { def int_gtarget_def : Intrinsic<[llvm_anyint_ty]>; def int_gtarget_defg : Intrinsic<[llvm_anyint_ty]>; def int_gtarget_uvw : Intrinsic<[llvm_anyint_ty]>; } -let isTarget = 1, TargetPrefix = "ftarget" in { +let TargetPrefix = "ftarget" in { def int_ftarget_ghi : Intrinsic<[llvm_anyint_ty]>; def int_ftarget_ghi_x : Intrinsic<[llvm_anyint_ty]>; def int_ftarget_rst : Intrinsic<[llvm_anyint_ty]>; diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index c0edbf0f01523..a30a7577408f8 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -106,17 +106,22 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, TargetPrefix + ".'!"); } - if (auto *Types = R->getValue("Types")) { - auto *TypeList = cast(Types->getValue()); - isOverloaded = R->getValueAsBit("isOverloaded"); + const Record *TypeInfo = R->getValueAsDef("TypeInfo"); + if (!TypeInfo->isSubClassOf("TypeInfoGen")) + PrintFatalError(DefLoc, "TypeInfo field in " + DefName + + " should be of subclass of TypeInfoGen!"); - unsigned I = 0; - for (unsigned E = R->getValueAsListInit("RetTypes")->size(); I < E; ++I) - IS.RetTys.push_back(TypeList->getElementAsRecord(I)); + isOverloaded = TypeInfo->getValueAsBit("isOverloaded"); + const ListInit *TypeList = TypeInfo->getValueAsListInit("Types"); - for (unsigned E = TypeList->size(); I < E; ++I) - IS.ParamTys.push_back(TypeList->getElementAsRecord(I)); - } + // Types field is a concatenation of Return types followed by Param types. + unsigned Idx = 0; + unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); + for (; Idx < NumRet; ++Idx) + IS.RetTys.push_back(TypeList->getElementAsRecord(Idx)); + + for (unsigned E = TypeList->size(); Idx < E; ++Idx) + IS.ParamTys.push_back(TypeList->getElementAsRecord(Idx)); // Parse the intrinsic properties. ListInit *PropList = R->getValueAsListInit("IntrProperties"); diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 0f4d7bf8db217..bda97c61d3d58 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -273,11 +273,12 @@ using TypeSigTy = SmallVector; /// Computes type signature of the intrinsic \p Int. 
static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) { TypeSigTy TypeSig; - if (const auto *R = Int.TheDef->getValue("TypeSig")) { - for (const auto *a : cast(R->getValue())->getValues()) { - for (const auto *b : cast(a)->getValues()) - TypeSig.emplace_back(cast(b)->getValue()); - } + const Record *TypeInfo = Int.TheDef->getValueAsDef("TypeInfo"); + const ListInit *OuterList = TypeInfo->getValueAsListInit("TypeSig"); + + for (const auto *Outer : OuterList->getValues()) { + for (const auto *Inner : cast(Outer)->getValues()) + TypeSig.emplace_back(cast(Inner)->getValue()); } return TypeSig; } From df50751d24da4f5fdf8f46119c09a7e941f7174b Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 4 Sep 2024 15:12:58 -0700 Subject: [PATCH 162/425] [SandboxIR] Implement ConstantAggregateZero (#107172) This patch implements sandboxir::ConstantAggregateZero mirroring llvm::ConstantAggregateZero. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 44 +++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/include/llvm/SandboxIR/Type.h | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 78 +++++++++++++++---- llvm/lib/SandboxIR/Type.cpp | 5 ++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 69 ++++++++++++++++ llvm/unittests/SandboxIR/TypesTest.cpp | 4 + 7 files changed, 186 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 89e963498426d..89bc9c46581fc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -120,6 +120,7 @@ namespace sandboxir { class BasicBlock; class ConstantInt; class ConstantFP; +class ConstantAggregateZero; class Context; class Function; class Instruction; @@ -316,6 +317,7 @@ class Value { friend class CmpInst; // For getting `Val`. friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. + friend class ConstantAggregateZero; // For `Val`. /// All values point to the context. Context &Ctx; @@ -943,6 +945,48 @@ class ConstantVector final : public ConstantAggregate { } }; +// TODO: Inherit from ConstantData. +class ConstantAggregateZero final : public Constant { + ConstantAggregateZero(llvm::ConstantAggregateZero *C, Context &Ctx) + : Constant(ClassID::ConstantAggregateZero, C, Ctx) {} + friend class Context; // For constructor. + +public: + static ConstantAggregateZero *get(Type *Ty); + /// If this CAZ has array or vector type, return a zero with the right element + /// type. + Constant *getSequentialElement() const; + /// If this CAZ has struct type, return a zero with the right element type for + /// the specified element. + Constant *getStructElement(unsigned Elt) const; + /// Return a zero of the right value for the specified GEP index if we can, + /// otherwise return null (e.g. if C is a ConstantExpr). + Constant *getElementValue(Constant *C) const; + /// Return a zero of the right value for the specified GEP index. + Constant *getElementValue(unsigned Idx) const; + /// Return the number of elements in the array, vector, or struct. + ElementCount getElementCount() const { + return cast(Val)->getElementCount(); + } + + /// For isa/dyn_cast. 
+ static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantAggregateZero; + } + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantAggregateZero has no operands!"); + } +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && "Expected a CAZ!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index f320f61934efa..b2180ba58afcc 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -30,6 +30,7 @@ DEF_CONST(ConstantFP, ConstantFP) DEF_CONST(ConstantArray, ConstantArray) DEF_CONST(ConstantStruct, ConstantStruct) DEF_CONST(ConstantVector, ConstantVector) +DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 61721ca836321..69ca156e82101 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -293,6 +293,7 @@ class PointerType : public Type { class ArrayType : public Type { public: + static ArrayType *get(Type *ElementType, uint64_t NumElements); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index c0e5837209213..89bbdf575c245 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2402,6 +2402,30 @@ StructType *ConstantStruct::getTypeForElements(Context &Ctx, return StructType::get(Ctx, EltTypes, Packed); } +ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { + auto *LLVMC = llvm::ConstantAggregateZero::get(Ty->LLVMTy); + return cast( + Ty->getContext().getOrCreateConstant(LLVMC)); +} + +Constant *ConstantAggregateZero::getSequentialElement() const { + return cast(Ctx.getValue( + cast(Val)->getSequentialElement())); +} +Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const { + return cast(Ctx.getValue( + cast(Val)->getStructElement(Elt))); +} +Constant *ConstantAggregateZero::getElementValue(Constant *C) const { + return cast( + Ctx.getValue(cast(Val)->getElementValue( + cast(C->Val)))); +} +Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { + return cast(Ctx.getValue( + cast(Val)->getElementValue(Idx))); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2489,26 +2513,48 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return It->second.get(); if (auto *C = dyn_cast(LLVMV)) { - if (auto *CI = dyn_cast(C)) { - It->second = std::unique_ptr(new ConstantInt(CI, *this)); + switch (C->getValueID()) { + case llvm::Value::ConstantIntVal: + It->second = std::unique_ptr( + new ConstantInt(cast(C), *this)); return It->second.get(); - } - if (auto *CF = dyn_cast(C)) { - It->second = std::unique_ptr(new ConstantFP(CF, *this)); + case llvm::Value::ConstantFPVal: + It->second = std::unique_ptr( + new ConstantFP(cast(C), *this)); return It->second.get(); + case llvm::Value::ConstantAggregateZeroVal: { + auto *CAZ = cast(C); + It->second = std::unique_ptr( + new 
ConstantAggregateZero(CAZ, *this)); + auto *Ret = It->second.get(); + // Must create sandboxir for elements. + auto EC = CAZ->getElementCount(); + if (EC.isFixed()) { + for (auto ElmIdx : seq(0, EC.getFixedValue())) + getOrCreateValueInternal(CAZ->getElementValue(ElmIdx), CAZ); + } + return Ret; } - if (auto *CA = dyn_cast(C)) - It->second = std::unique_ptr(new ConstantArray(CA, *this)); - else if (auto *CS = dyn_cast(C)) - It->second = - std::unique_ptr(new ConstantStruct(CS, *this)); - else if (auto *CV = dyn_cast(C)) - It->second = - std::unique_ptr(new ConstantVector(CV, *this)); - else if (auto *F = dyn_cast(LLVMV)) - It->second = std::unique_ptr(new Function(F, *this)); - else + case llvm::Value::ConstantArrayVal: + It->second = std::unique_ptr( + new ConstantArray(cast(C), *this)); + break; + case llvm::Value::ConstantStructVal: + It->second = std::unique_ptr( + new ConstantStruct(cast(C), *this)); + break; + case llvm::Value::ConstantVectorVal: + It->second = std::unique_ptr( + new ConstantVector(cast(C), *this)); + break; + case llvm::Value::FunctionVal: + It->second = std::unique_ptr( + new Function(cast(C), *this)); + break; + default: It->second = std::unique_ptr(new Constant(C, *this)); + break; + } auto *NewC = It->second.get(); for (llvm::Value *COp : C->operands()) getOrCreateValueInternal(COp, C); diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index 535b0f75fd874..11a16e865213f 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -47,6 +47,11 @@ PointerType *PointerType::get(Context &Ctx, unsigned AddressSpace) { Ctx.getType(llvm::PointerType::get(Ctx.LLVMCtx, AddressSpace))); } +ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) { + return cast(ElementType->getContext().getType( + llvm::ArrayType::get(ElementType->LLVMTy, NumElements))); +} + StructType *StructType::get(Context &Ctx, ArrayRef Elements, bool IsPacked) { SmallVector LLVMElements; diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index d1c5690ccad5b..ebb127915ba85 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -520,6 +520,75 @@ define void @foo() { EXPECT_EQ(StructTy2Packed, StructTyPacked); } +TEST_F(SandboxIRTest, ConstantAggregateZero) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, {i32, i8} %v1, <2 x i8> %v2) { + %extr0 = extractvalue [2 x i8] zeroinitializer, 0 + %extr1 = extractvalue {i32, i8} zeroinitializer, 0 + %extr2 = extractelement <2 x i8> zeroinitializer, i32 0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *Extr0 = &*It++; + auto *Extr1 = &*It++; + auto *Extr2 = &*It++; + [[maybe_unused]] auto *Ret = cast(&*It++); + auto *Zero32 = + sandboxir::ConstantInt::get(sandboxir::Type::getInt32Ty(Ctx), 0); + auto *Zero8 = sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 0); + auto *Int8Ty = sandboxir::Type::getInt8Ty(Ctx); + auto *Int32Ty = sandboxir::Type::getInt32Ty(Ctx); + auto *ArrayTy = sandboxir::ArrayType::get(Int8Ty, 2u); + auto *StructTy = sandboxir::StructType::get(Ctx, {Int32Ty, Int8Ty}); + auto *VectorTy = + sandboxir::VectorType::get(Int8Ty, ElementCount::getFixed(2u)); + + // Check creation and classof(). 
+ auto *ArrayCAZ = cast(Extr0->getOperand(0)); + EXPECT_EQ(ArrayCAZ->getType(), ArrayTy); + auto *StructCAZ = + cast(Extr1->getOperand(0)); + EXPECT_EQ(StructCAZ->getType(), StructTy); + auto *VectorCAZ = + cast(Extr2->getOperand(0)); + EXPECT_EQ(VectorCAZ->getType(), VectorTy); + // Check get(). + auto *SameVectorCAZ = + sandboxir::ConstantAggregateZero::get(sandboxir::VectorType::get( + sandboxir::Type::getInt8Ty(Ctx), ElementCount::getFixed(2))); + EXPECT_EQ(SameVectorCAZ, VectorCAZ); // Should be uniqued. + auto *NewVectorCAZ = + sandboxir::ConstantAggregateZero::get(sandboxir::VectorType::get( + sandboxir::Type::getInt8Ty(Ctx), ElementCount::getFixed(4))); + EXPECT_NE(NewVectorCAZ, VectorCAZ); + // Check getSequentialElement(). + auto *SeqElm = VectorCAZ->getSequentialElement(); + EXPECT_EQ(SeqElm, + sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 0)); + // Check getStructElement(). + auto *StructElm0 = StructCAZ->getStructElement(0); + auto *StructElm1 = StructCAZ->getStructElement(1); + EXPECT_EQ(StructElm0, Zero32); + EXPECT_EQ(StructElm1, Zero8); + // Check getElementValue(Constant). + EXPECT_EQ(ArrayCAZ->getElementValue(Zero32), Zero8); + EXPECT_EQ(StructCAZ->getElementValue(Zero32), Zero32); + EXPECT_EQ(VectorCAZ->getElementValue(Zero32), Zero8); + // Check getElementValue(unsigned). + EXPECT_EQ(ArrayCAZ->getElementValue(0u), Zero8); + EXPECT_EQ(StructCAZ->getElementValue(0u), Zero32); + EXPECT_EQ(VectorCAZ->getElementValue(0u), Zero8); + // Check getElementCount(). + EXPECT_EQ(ArrayCAZ->getElementCount(), ElementCount::getFixed(2)); + EXPECT_EQ(NewVectorCAZ->getElementCount(), ElementCount::getFixed(4)); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index d4c2de441268c..36ef0cf8e5291 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -236,6 +236,10 @@ define void @foo([2 x i8] %v0) { // Check classof(), creation. [[maybe_unused]] auto *ArrayTy = cast(F->getArg(0)->getType()); + // Check get(). + auto *NewArrayTy = + sandboxir::ArrayType::get(sandboxir::Type::getInt8Ty(Ctx), 2u); + EXPECT_EQ(NewArrayTy, ArrayTy); } TEST_F(SandboxTypeTest, StructType) { From 9171881d64e4834de7ad7c9807607ce6bc5167a9 Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Wed, 4 Sep 2024 22:19:11 +0000 Subject: [PATCH 163/425] [AMDGPU][Docs] DWARF aspace-aware base types (post-review fixes) --- .../AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst | 2 +- llvm/docs/AMDGPUUsage.rst | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst index b71e80f922250..0249c580964a0 100644 --- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -3807,7 +3807,7 @@ A.5.1 Base Type Entries 2. A ``DW_TAG_base_type`` debugger information entry with the encoding ``DW_ATE_address`` may have a ``DW_AT_LLVM_address_space`` attribute whose value is an architecture specific address space (see - :ref:`amdgpu-dwarf-address-spaces`). If ommitted it defaults to + :ref:`amdgpu-dwarf-address-spaces`). If omitted it defaults to ``DW_ASPACE_LLVM_none``. .. 
_amdgpu-dwarf-type-modifier-entries: diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index a5ad3f6bbf7b2..ba62a68c4a509 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2938,13 +2938,15 @@ error. or from that address space is always an evaluation error. For targets which support the generic address space, converting from - ``DW_ASPACE_AMDGPU_generic`` to ``DW_ASPACE_LLVM_none`` is always defined and - requires no change to the literal value of the address. + ``DW_ASPACE_AMDGPU_generic`` to ``DW_ASPACE_LLVM_none`` is defined when the + generic address is in the global address space. The conversion requires no + change to the literal value of the address. Converting from ``DW_ASPACE_AMDGPU_generic`` to any of ``DW_ASPACE_AMDGPU_local``, ``DW_ASPACE_AMDGPU_private_wave`` or ``DW_ASPACE_AMDGPU_private_lane`` is defined when the relevant hardware - support is present and setup has been completed. Conversion to + support is present, any required hardware setup has been completed, and the + generic address is in the corresponding address space. Conversion to ``DW_ASPACE_AMDGPU_private_lane`` additionally requires the context to include the active lane. From 7c4eb60c9509c3a750961eac2dbcaad369d911f2 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 4 Sep 2024 18:43:54 -0400 Subject: [PATCH 164/425] [Clang] Fix CLANG_TOOLCHAIN_PROGRAM_TIMEOUT logic PR #102521, which landed as 1ea0865dd6fa, implemented `CLANG_TOOLCHAIN_PROGRAM_TIMEOUT`, but the logic is obviously wrong. If the user-specified value is negative, it should become zero to mean infinite. Otherwise, it should be left as is. Thus, use `std::max` not `std::min`. This obvious fixup doesn't seem worth another pull request. --- clang/lib/Driver/ToolChain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 64f23d43e87ee..16f9b629fc538 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -126,7 +126,7 @@ ToolChain::executeToolChainProgram(StringRef Executable) const { "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected " "an integer, got '" + *Str + "'"); - SecondsToWait = std::min(SecondsToWait, 0); // infinite + SecondsToWait = std::max(SecondsToWait, 0); // infinite } if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait, /*MemoryLimit=*/0, &ErrorMessage)) From 950bb68516eb564c29815997450bdb6516ffdcec Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 4 Sep 2024 15:55:08 -0700 Subject: [PATCH 165/425] [SandboxIR] Implement ConstantPointerNull (#107320) This patch implements sandboxir::ConstantPointerNull mirroring llvm::ConstantPointerNull. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 31 +++++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 15 +++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 27 ++++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 89bc9c46581fc..4db4fae24b4b3 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -121,6 +121,7 @@ class BasicBlock; class ConstantInt; class ConstantFP; class ConstantAggregateZero; +class ConstantPointerNull; class Context; class Function; class Instruction; @@ -318,6 +319,7 @@ class Value { friend class ConstantArray; // For `Val`. friend class ConstantStruct; // For `Val`. friend class ConstantAggregateZero; // For `Val`. 
+ friend class ConstantPointerNull; // For `Val`. /// All values point to the context. Context &Ctx; @@ -987,6 +989,35 @@ class ConstantAggregateZero final : public Constant { #endif }; +// TODO: Inherit from ConstantData. +class ConstantPointerNull final : public Constant { + ConstantPointerNull(llvm::ConstantPointerNull *C, Context &Ctx) + : Constant(ClassID::ConstantPointerNull, C, Ctx) {} + friend class Context; // For constructor. + +public: + static ConstantPointerNull *get(PointerType *Ty); + + PointerType *getType() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantPointerNull; + } + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantPointerNull has no operands!"); + } +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && "Expected a CPNull!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index b2180ba58afcc..fce5aacc8c86d 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -31,6 +31,7 @@ DEF_CONST(ConstantArray, ConstantArray) DEF_CONST(ConstantStruct, ConstantStruct) DEF_CONST(ConstantVector, ConstantVector) DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) +DEF_CONST(ConstantPointerNull, ConstantPointerNull) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 89bbdf575c245..acbc5c17ab256 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2426,6 +2426,17 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { cast(Val)->getElementValue(Idx))); } +ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { + auto *LLVMC = + llvm::ConstantPointerNull::get(cast(Ty->LLVMTy)); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} + +PointerType *ConstantPointerNull::getType() const { + return cast( + Ctx.getType(cast(Val)->getType())); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2535,6 +2546,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { } return Ret; } + case llvm::Value::ConstantPointerNullVal: + It->second = std::unique_ptr( + new ConstantPointerNull(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantArrayVal: It->second = std::unique_ptr( new ConstantArray(cast(C), *this)); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index ebb127915ba85..2f5ef92578e77 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -589,6 +589,33 @@ define void @foo(ptr %ptr, {i32, i8} %v1, <2 x i8> %v2) { EXPECT_EQ(NewVectorCAZ->getElementCount(), ElementCount::getFixed(4)); } +TEST_F(SandboxIRTest, ConstantPointerNull) { + parseIR(C, R"IR( +define ptr @foo() { + ret ptr null +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *Ret = cast(&*It++); + // Check classof() and 
creation. + auto *CPNull = cast(Ret->getReturnValue()); + // Check get(). + auto *NewCPNull = + sandboxir::ConstantPointerNull::get(sandboxir::PointerType::get(Ctx, 0u)); + EXPECT_EQ(NewCPNull, CPNull); + auto *NewCPNull2 = + sandboxir::ConstantPointerNull::get(sandboxir::PointerType::get(Ctx, 1u)); + EXPECT_NE(NewCPNull2, CPNull); + // Check getType(). + EXPECT_EQ(CPNull->getType(), sandboxir::PointerType::get(Ctx, 0u)); + EXPECT_EQ(NewCPNull2->getType(), sandboxir::PointerType::get(Ctx, 1u)); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From 9efe377307694be0c92f7cb3b02fd1d090fdbeb8 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 4 Sep 2024 16:03:13 -0700 Subject: [PATCH 166/425] [HLSL] Implement '__builtin_hlsl_is_intangible' type trait (#104544) Implements `__builtin_hlsl_is_intangible` type trait. HLSL intangible types are special implementation-defined types such as resource handles or samplers. Any class that is an array of intangible type or contains base class or members of intangible types is also an intangible type. Fixes #[102954](https://github.com/llvm/llvm-project/issues/102954) --- .../clang/AST/CXXRecordDeclDefinitionBits.def | 4 + clang/include/clang/AST/DeclCXX.h | 4 + clang/include/clang/AST/Type.h | 7 ++ clang/include/clang/Basic/TokenKinds.def | 3 +- clang/include/clang/Sema/SemaHLSL.h | 1 + clang/lib/AST/DeclCXX.cpp | 17 +++- clang/lib/Sema/SemaExprCXX.cpp | 11 +++ clang/lib/Sema/SemaHLSL.cpp | 28 +++++++ .../Types/Traits/IsIntangibleType.hlsl | 78 +++++++++++++++++++ .../Types/Traits/IsIntangibleTypeErrors.hlsl | 11 +++ 10 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl create mode 100644 clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl diff --git a/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def b/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def index cdf0804680ad0..6620840df0ced 100644 --- a/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def +++ b/clang/include/clang/AST/CXXRecordDeclDefinitionBits.def @@ -249,4 +249,8 @@ FIELD(HasDeclaredCopyAssignmentWithConstParam, 1, MERGE_OR) /// base classes or fields have a no-return destructor FIELD(IsAnyDestructorNoReturn, 1, NO_MERGE) +/// Whether the record type is intangible (if any base classes or fields have +/// type that is intangible). HLSL only. +FIELD(IsHLSLIntangible, 1, NO_MERGE) + #undef FIELD diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 0d72cc6a08dcb..252e6e9256414 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1547,6 +1547,10 @@ class CXXRecordDecl : public RecordDecl { /// destructors are marked noreturn. bool isAnyDestructorNoReturn() const { return data().IsAnyDestructorNoReturn; } + /// Returns true if the class contains HLSL intangible type, either as + /// a field or in base class. + bool isHLSLIntangible() const { return data().IsHLSLIntangible; } + /// If the class is a local class [class.local], returns /// the enclosing function declaration. 
const FunctionDecl *isLocalClass() const { diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 853226118af40..ef36a73716454 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2658,6 +2658,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) bool is##Id##Type() const; #include "clang/Basic/HLSLIntangibleTypes.def" bool isHLSLSpecificType() const; // Any HLSL specific type + bool isHLSLIntangibleType() const; // Any HLSL intangible type /// Determines if this type, which must satisfy /// isObjCLifetimeType(), is implicitly __unsafe_unretained rather @@ -8341,6 +8342,12 @@ inline bool Type::isHLSLSpecificType() const { false; // end boolean or operation } +inline bool Type::isHLSLIntangibleType() const { + // All HLSL specific types are currently intangible type as well, but that + // might change in the future. + return isHLSLSpecificType(); +} + inline bool Type::isTemplateTypeParmType() const { return isa(CanonicalType); } diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 212c1f6ff3a12..a82ff684b2ac7 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -660,8 +660,9 @@ KEYWORD(out , KEYHLSL) #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) KEYWORD(Name, KEYHLSL) #include "clang/Basic/HLSLIntangibleTypes.def" -// HLSL Type traits. +// HLSL Type traits TYPE_TRAIT_2(__builtin_hlsl_is_scalarized_layout_compatible, IsScalarizedLayoutCompatible, KEYHLSL) +TYPE_TRAIT_1(__builtin_hlsl_is_intangible, IsIntangibleType, KEYHLSL) // OpenMP Type Traits UNARY_EXPR_OR_TYPE_TRAIT(__builtin_omp_required_simd_align, OpenMPRequiredSimdAlign, KEYALL) diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index d79ca9a4fa18d..285e4e5f3c765 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -71,6 +71,7 @@ class SemaHLSL : public SemaBase { // HLSL Type trait implementations bool IsScalarizedLayoutCompatible(QualType T1, QualType T2) const; + bool IsIntangibleType(QualType T1); bool CheckCompatibleParameterABI(FunctionDecl *New, FunctionDecl *Old); diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 9a3ede426e914..01143391edab4 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -109,7 +109,7 @@ CXXRecordDecl::DefinitionData::DefinitionData(CXXRecordDecl *D) ImplicitCopyAssignmentHasConstParam(true), HasDeclaredCopyConstructorWithConstParam(false), HasDeclaredCopyAssignmentWithConstParam(false), - IsAnyDestructorNoReturn(false), IsLambda(false), + IsAnyDestructorNoReturn(false), IsHLSLIntangible(false), IsLambda(false), IsParsingBaseSpecifiers(false), ComputedVisibleConversions(false), HasODRHash(false), Definition(D) {} @@ -431,6 +431,9 @@ CXXRecordDecl::setBases(CXXBaseSpecifier const * const *Bases, if (BaseClassDecl->isAnyDestructorNoReturn()) data().IsAnyDestructorNoReturn = true; + if (BaseClassDecl->isHLSLIntangible()) + data().IsHLSLIntangible = true; + // C++11 [class.copy]p18: // The implicitly-declared copy assignment operator for a class X will // have the form 'X& X::operator=(const X&)' if each direct base class B @@ -1401,6 +1404,18 @@ void CXXRecordDecl::addedMember(Decl *D) { // than subobjects of zero size if (data().Empty && !IsZeroSize) data().Empty = false; + + if (getLangOpts().HLSL) { + const Type *Ty = 
Field->getType()->getUnqualifiedDesugaredType(); + while (isa(Ty)) + Ty = Ty->getArrayElementTypeNoTypeQual(); + + Ty = Ty->getUnqualifiedDesugaredType(); + if (Ty->isBuiltinType()) + data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); + else if (const RecordType *RT = dyn_cast(Ty)) + data().IsHLSLIntangible |= RT->getAsCXXRecordDecl()->isHLSLIntangible(); + } } // Handle using declarations of conversion functions. diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index b7531581d37ff..14feafd1e6b17 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -5111,6 +5111,7 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_IsDestructible: case UTT_IsNothrowDestructible: case UTT_IsTriviallyDestructible: + case UTT_IsIntangibleType: if (ArgTy->isIncompleteArrayType() || ArgTy->isVoidType()) return true; @@ -5696,6 +5697,16 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, return true; return false; } + case UTT_IsIntangibleType: + assert(Self.getLangOpts().HLSL && "intangible types are HLSL-only feature"); + if (!T->isVoidType() && !T->isIncompleteArrayType()) + if (Self.RequireCompleteType(TInfo->getTypeLoc().getBeginLoc(), T, + diag::err_incomplete_type)) + return false; + if (DiagnoseVLAInCXXTypeTrait(Self, TInfo, + tok::kw___builtin_hlsl_is_intangible)) + return false; + return Self.HLSL().IsIntangibleType(T); } } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 778d524a00548..65aeda4b7b613 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -10,8 +10,11 @@ #include "clang/Sema/SemaHLSL.h" #include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" +#include "clang/AST/DeclCXX.h" #include "clang/AST/Expr.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/Type.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -1609,6 +1612,31 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return false; } +bool SemaHLSL::IsIntangibleType(clang::QualType QT) { + if (QT.isNull()) + return false; + + const Type *Ty = QT->getUnqualifiedDesugaredType(); + + // check if it's a builtin type first (simple check, no need to cache it) + if (Ty->isBuiltinType()) + return Ty->isHLSLIntangibleType(); + + // unwrap arrays + while (isa(Ty)) + Ty = Ty->getArrayElementTypeNoTypeQual(); + + const RecordType *RT = + dyn_cast(Ty->getUnqualifiedDesugaredType()); + if (!RT) + return false; + + CXXRecordDecl *RD = RT->getAsCXXRecordDecl(); + assert(RD != nullptr && + "all HLSL struct and classes should be CXXRecordDecl"); + return RD->isHLSLIntangible(); +} + static void BuildFlattenedTypeList(QualType BaseTy, llvm::SmallVectorImpl &List) { llvm::SmallVector WorkList; diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl new file mode 100644 index 0000000000000..92cba1dcd4bdf --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); +// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported + 
+_Static_assert(!__builtin_hlsl_is_intangible(int), ""); +_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); +_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); + +typedef __hlsl_resource_t Res; +_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); +// no need to check array of Res, arrays of sizeless types are not supported + +struct ABuffer { + const int i[10]; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); + +struct MyStruct { + half2 h2; + int3 i3; +}; +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); + +class MyClass { + int3 ivec; + float farray[12]; + MyStruct ms; + ABuffer buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); + +union U { + double d[4]; + Res buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(U), ""); +_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); + +class MyClass2 { + int3 ivec; + float farray[12]; + U u; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); + +class Simple { + int a; +}; + +template struct TemplatedBuffer { + T a; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); + +struct MyStruct2 : TemplatedBuffer { + float x; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); + +struct MyStruct3 { + const TemplatedBuffer TB[10]; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); + +template struct SimpleTemplate { + T a; +}; +_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); +_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl new file mode 100644 index 0000000000000..0803086749bd7 --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} +_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} + +void fn(int X) { + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla +} From aecbc924102ee57ea639cd76ed32b37eb2d257fc Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 4 Sep 2024 16:14:13 -0700 Subject: [PATCH 167/425] [WebAssembly] Rename CATCH/CATCH_ALL to *_LEGACY (#107187) This renames MIR instruction `CATCH` and `CATCH_ALL` to `CATCH_LEGACY` and `CATCH_ALL_LEGACY` respectively. Follow-up PRs for the new EH (exnref) implementation will use `CATCH`, `CATCH_REF`, `CATCH_ALL`, and `CATCH_ALL_REF` as pseudo-instructions that return extracted values or `exnref` or both, because we don't currently support block return values in LLVM. 
So to give the old (real) `CATCH`es and the new (pseudo) `CATCH`es different names, this attaches `_LEGACY` prefix to the old names. This also rearranges `WebAssemblyInstrControl.td` so that the old legacy instructions are listed all together at the end. --- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 17 +++++----- .../MCTargetDesc/WebAssemblyInstPrinter.h | 2 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 8 ++--- .../WebAssembly/WebAssemblyCFGStackify.cpp | 10 +++--- .../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 2 +- .../WebAssembly/WebAssemblyInstrControl.td | 34 ++++++++++++------- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 2 +- .../WebAssembly/cfg-stackify-eh-legacy.mir | 23 +++++++------ .../CodeGen/WebAssembly/exception-legacy.mir | 4 +-- .../CodeGen/WebAssembly/function-info.mir | 4 +-- .../WebAssemblyExceptionInfoTest.cpp | 16 ++++----- 11 files changed, 66 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index bf6d6dce1f8ac..b85ed1d93593b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -166,15 +166,15 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address, } return; - case WebAssembly::CATCH: - case WebAssembly::CATCH_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_LEGACY_S: + case WebAssembly::CATCH_ALL_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY_S: // There can be multiple catch instructions for one try instruction, so // we print a label only for the first 'catch' label. if (EHInstStack.empty()) { printAnnotation(OS, "try-catch mismatch!"); - } else if (EHInstStack.back() == CATCH_ALL) { + } else if (EHInstStack.back() == CATCH_ALL_LEGACY) { printAnnotation(OS, "catch/catch_all cannot occur after catch_all"); } else if (EHInstStack.back() == TRY) { if (TryStack.empty()) { @@ -183,10 +183,11 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address, printAnnotation(OS, "catch" + utostr(TryStack.pop_back_val()) + ':'); } EHInstStack.pop_back(); - if (Opc == WebAssembly::CATCH || Opc == WebAssembly::CATCH_S) { - EHInstStack.push_back(CATCH); + if (Opc == WebAssembly::CATCH_LEGACY || + Opc == WebAssembly::CATCH_LEGACY_S) { + EHInstStack.push_back(CATCH_LEGACY); } else { - EHInstStack.push_back(CATCH_ALL); + EHInstStack.push_back(CATCH_ALL_LEGACY); } } return; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index f3c0124fd7f1f..8fd54d1640905 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -28,7 +28,7 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { SmallVector, 4> ControlFlowStack; SmallVector TryStack; - enum EHInstKind { TRY, CATCH, CATCH_ALL }; + enum EHInstKind { TRY, CATCH_LEGACY, CATCH_ALL_LEGACY }; SmallVector EHInstStack; public: diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index eb3087dafed2a..00f15e1db5e13 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -470,10 +470,10 @@ inline bool 
isMarker(unsigned Opc) { inline bool isCatch(unsigned Opc) { switch (Opc) { - case WebAssembly::CATCH: - case WebAssembly::CATCH_S: - case WebAssembly::CATCH_ALL: - case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_LEGACY_S: + case WebAssembly::CATCH_ALL_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 6fd882f62f3f0..3362ea5316e45 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1305,7 +1305,7 @@ bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) { // catch_all always catches an exception, so we don't need to do // anything - if (MI.getOpcode() == WebAssembly::CATCH_ALL) { + if (MI.getOpcode() == WebAssembly::CATCH_ALL_LEGACY) { } // This can happen when the unwind dest was removed during the @@ -1448,8 +1448,8 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) { case WebAssembly::DELEGATE: updateScopeTops(EndToBegin[&MI]->getParent(), &MBB); break; - case WebAssembly::CATCH: - case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY: updateScopeTops(EHPadToTry[&MBB]->getParent(), &MBB); break; } @@ -1698,8 +1698,8 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { Stack.push_back(std::make_pair(EndToBegin[&MI]->getParent(), &MI)); break; - case WebAssembly::CATCH: - case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_LEGACY: + case WebAssembly::CATCH_ALL_LEGACY: EHPadStack.pop_back(); break; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 0f06f54f219f9..60c5e18fbb0cd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -212,7 +212,7 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { int Tag = Node->getConstantOperandVal(2); SDValue SymNode = getTagSymNode(Tag, CurDAG); MachineSDNode *Catch = - CurDAG->getMachineNode(WebAssembly::CATCH, DL, + CurDAG->getMachineNode(WebAssembly::CATCH_LEGACY, DL, { PtrVT, // exception pointer MVT::Other // outchain type diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index be6547007aaf7..dd40015577fd7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -127,11 +127,27 @@ defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>; let Predicates = [HasExceptionHandling] in { -// Throwing an exception: throw / rethrow +// Throwing an exception: throw let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { defm THROW : I<(outs), (ins tag_op:$tag, variable_ops), (outs), (ins tag_op:$tag), [], "throw \t$tag", "throw \t$tag", 0x08>; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +// Pseudo instructions: cleanupret / catchret +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isPseudo = 1, isEHScopeReturn = 1 in { + defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>; + defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from), + [(catchret bb:$dst, bb:$from)], "catchret", 0>; +} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + // isPseudo = 
1, isEHScopeReturn = 1 + +// Below are instructions from the legacy EH proposal. Could be deprecated if +// usage gets low enough. + +// Rethrowing an exception: rethrow +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // The depth argument will be computed in CFGStackify. We set it to 0 here for @@ -147,22 +163,14 @@ defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>; // Catching an exception: catch / catch_all let hasCtrlDep = 1, hasSideEffects = 1 in { let variadicOpsAreDefs = 1 in -defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops), - (outs), (ins tag_op:$tag), [], - "catch", "catch \t$tag", 0x07>; -defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x19>; +defm CATCH_LEGACY : I<(outs), (ins tag_op:$tag, variable_ops), + (outs), (ins tag_op:$tag), [], + "catch", "catch \t$tag", 0x07>; +defm CATCH_ALL_LEGACY : NRI<(outs), (ins), [], "catch_all", 0x19>; } // Delegating an exception: delegate let isTerminator = 1, hasCtrlDep = 1, hasSideEffects = 1 in defm DELEGATE : NRI<(outs), (ins bb_op:$dst), [], "delegate \t $dst", 0x18>; -// Pseudo instructions: cleanupret / catchret -let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, - isPseudo = 1, isEHScopeReturn = 1 in { - defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>; - defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from), - [(catchret bb:$dst, bb:$from)], "catchret", 0>; -} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, - // isPseudo = 1, isEHScopeReturn = 1 } // Predicates = [HasExceptionHandling] diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index 94037b9ab189d..f0c205cdb6aeb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -216,7 +216,7 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) { Changed = true; BuildMI(MBB, InsertPos, InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(), - TII.get(WebAssembly::CATCH_ALL)); + TII.get(WebAssembly::CATCH_ALL_LEGACY)); } } return Changed; diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir index 0386410d1b612..c37a82fe80826 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.mir @@ -31,22 +31,23 @@ body: | bb.1 (landing-pad): successors: %bb.2 ; CHECK: bb.1 (landing-pad): - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: TRY - ; This RETHROW rethrows the exception caught by this BB's CATCH, but after - ; CFGStackify a TRY is placed between the CATCH and this RETHROW, so after - ; CFGStackify its immediate argument should become not 0, but 1. + ; This RETHROW rethrows the exception caught by this BB's CATCH_LEGACY, but + ; after CFGStackify a TRY is placed between the CATCH_LEGACY and this + ; RETHROW, so after CFGStackify its immediate argument should become not 0, + ; but 1. 
; CHECK: RETHROW 1 EH_LABEL - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.2 (landing-pad): ; CHECK: bb.2 (landing-pad): - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: RETHROW 0 EH_LABEL - %1:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.3: @@ -78,13 +79,13 @@ body: | ; CHECK: CALL @foo ; CHECK: DELEGATE ; CHECK: RETURN - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ;; This TRY should have the return type i32 (127) ; CHECK: TRY 127 ; CHECK: RETHROW ; CHECK: DELEGATE ; CHECK: END_TRY - ; CHECK: CATCH + ; CHECK: CATCH_LEGACY ; CHECK: RETHROW ; CHECK: END_TRY bb.0: @@ -105,11 +106,11 @@ body: | bb.3 (landing-pad): EH_LABEL - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments bb.4 (landing-pad): EH_LABEL - %1:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments RETHROW 0, implicit-def dead $arguments ... diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir index 895e8d8864ea2..f9eb40bb03cc7 100644 --- a/llvm/test/CodeGen/WebAssembly/exception-legacy.mir +++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.mir @@ -42,13 +42,13 @@ body: | bb.1 (landing-pad): ; predecessors: %bb.0 successors: %bb.2 - ; CATCH_ALL should be after EH_LABELs in the beginning of an EH pad. + ; CATCH_ALL_LEGACY should be after EH_LABELs in the beginning of an EH pad. ; (Sometimes there are multiple EH_LABELs in an EH pad. This test tests ; that.) GLOBAL_SET should follow right after that. ; CHECK: bb.1 ; CHECK: EH_LABEL ; CHECK: EH_LABEL - ; CHECK-NEXT: CATCH_ALL + ; CHECK-NEXT: CATCH_ALL_LEGACY ; CHECK-NEXT: GLOBAL_SET_I32 EH_LABEL EH_LABEL diff --git a/llvm/test/CodeGen/WebAssembly/function-info.mir b/llvm/test/CodeGen/WebAssembly/function-info.mir index 2971d234c9b2d..d241d59b67a39 100644 --- a/llvm/test/CodeGen/WebAssembly/function-info.mir +++ b/llvm/test/CodeGen/WebAssembly/function-info.mir @@ -61,12 +61,12 @@ body: | bb.2 (landing-pad): successors: %bb.1, %bb.3 - %0:i32 = CATCH &__cpp_exception, implicit-def dead $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def dead $arguments CALL @foo, implicit-def dead $arguments, implicit $sp32, implicit $sp64, implicit-def dead $arguments, implicit $sp32, implicit $sp64 BR %bb.1, implicit-def $arguments bb.3 (landing-pad): - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def $arguments ... 
diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp index 55caaf5d13b6c..073beb9446ffb 100644 --- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp +++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp @@ -101,14 +101,14 @@ body: | ; predecessors: %bb.0 successors: %bb.3, %bb.9 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.3 (landing-pad): ; predecessors: %bb.2 successors: %bb.4, %bb.6 liveins: $value_stack - %1:i32 = CATCH &__cpp_exception, implicit-def $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.4, %58:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.6, implicit-def $arguments @@ -139,13 +139,13 @@ body: | ; predecessors: %bb.4 successors: %bb.9 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.9 (landing-pad): ; predecessors: %bb.2, %bb.6, %bb.8 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.10: @@ -257,7 +257,7 @@ body: | ; predecessors: %bb.0 successors: %bb.2, %bb.8 liveins: $value_stack - %0:i32 = CATCH &__cpp_exception, implicit-def $arguments + %0:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.2, %32:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.8, implicit-def $arguments @@ -271,7 +271,7 @@ body: | ; predecessors: %bb.2 successors: %bb.4, %bb.6 liveins: $value_stack - %1:i32 = CATCH &__cpp_exception, implicit-def $arguments + %1:i32 = CATCH_LEGACY &__cpp_exception, implicit-def $arguments BR_IF %bb.4, %43:i32, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack BR %bb.6, implicit-def $arguments @@ -313,13 +313,13 @@ body: | ; predecessors: %bb.4 successors: %bb.11 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.11 (landing-pad): ; predecessors: %bb.2, %bb.6, %bb.10 liveins: $value_stack - CATCH_ALL implicit-def $arguments + CATCH_ALL_LEGACY implicit-def $arguments RETHROW 0, implicit-def dead $arguments bb.12: From 23457964392d00fc872fa6021763859024fb38da Mon Sep 17 00:00:00 2001 From: ziqingluo-90 Date: Wed, 4 Sep 2024 16:26:04 -0700 Subject: [PATCH 168/425] Revert "[-Wunsafe-buffer-usage] Warning Libc functions (#101583)" This reverts commit 0fffdeb5f46078ddcc61e112cd38856b1165f050. 
Will re-land this commit soon with a way to opt-out --- .../Analysis/Analyses/UnsafeBufferUsage.h | 15 - .../Analyses/UnsafeBufferUsageGadgets.def | 1 - .../clang/Basic/DiagnosticSemaKinds.td | 7 - clang/lib/Analysis/UnsafeBufferUsage.cpp | 513 +----------------- clang/lib/Sema/AnalysisBasedWarnings.cpp | 14 - ...-usage-libc-functions-inline-namespace.cpp | 60 -- ...arn-unsafe-buffer-usage-libc-functions.cpp | 106 ---- ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 8 files changed, 4 insertions(+), 716 deletions(-) delete mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp delete mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index aa2c01ad10d45..228b4ae1e3e11 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,7 +15,6 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" -#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -107,20 +106,6 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; - /// Invoked when a call to an unsafe libc function is found. - /// \param PrintfInfo - /// is 0 if the callee function is not a member of the printf family; - /// is 1 if the callee is `sprintf`; - /// is 2 if arguments of the call have `__size_by` relation but are not in a - /// safe pattern; - /// is 3 if string arguments do not guarantee null-termination - /// is 4 if the callee takes va_list - /// \param UnsafeArg one of the actual arguments that is unsafe, non-null - /// only when `2 <= PrintfInfo <= 3` - virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, - ASTContext &Ctx, - const Expr *UnsafeArg = nullptr) = 0; - /// Invoked when an unsafe operation with a std container is found. 
virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index ac01b285ae833..242ad763ba62b 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -38,7 +38,6 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) -WARNING_GADGET(UnsafeLibcFunctionCall) WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 35f68f51dfb35..dcb49d8a67604 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12412,13 +12412,6 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; -def warn_unsafe_buffer_libc_call : Warning< - "function %0 is unsafe">, - InGroup, DefaultIgnore; -def note_unsafe_buffer_printf_call : Note< - "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" - "|string argument is not guaranteed to be null-terminated" - "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index f0d072643f8ff..da7446913f7c8 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" -#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -443,426 +443,6 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } -AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { - return Node.getNumArgs() == Num; -} - -namespace libc_func_matchers { -// Under `libc_func_matchers`, define a set of matchers that match unsafe -// functions in libc and unsafe calls to them. - -// A tiny parser to strip off common prefix and suffix of libc function names -// in real code. 
-// -// Given a function name, `matchName` returns `CoreName` according to the -// following grammar: -// -// LibcName := CoreName | CoreName + "_s" -// MatchingName := "__builtin_" + LibcName | -// "__builtin___" + LibcName + "_chk" | -// "__asan_" + LibcName -// -struct LibcFunNamePrefixSuffixParser { - StringRef matchName(StringRef FunName, bool isBuiltin) { - // Try to match __builtin_: - if (isBuiltin && FunName.starts_with("__builtin_")) - // Then either it is __builtin_LibcName or __builtin___LibcName_chk or - // no match: - return matchLibcNameOrBuiltinChk( - FunName.drop_front(10 /* truncate "__builtin_" */)); - // Try to match __asan_: - if (FunName.starts_with("__asan_")) - return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); - return matchLibcName(FunName); - } - - // Parameter `Name` is the substring after stripping off the prefix - // "__builtin_". - StringRef matchLibcNameOrBuiltinChk(StringRef Name) { - if (Name.starts_with("__") && Name.ends_with("_chk")) - return matchLibcName( - Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); - return matchLibcName(Name); - } - - StringRef matchLibcName(StringRef Name) { - if (Name.ends_with("_s")) - return Name.drop_back(2 /* truncate "_s" */); - return Name; - } -}; - -// A pointer type expression is known to be null-terminated, if it has the -// form: E.c_str(), for any expression E of `std::string` type. -static bool isNullTermPointer(const Expr *Ptr) { - if (isa(Ptr->IgnoreParenImpCasts())) - return true; - if (isa(Ptr->IgnoreParenImpCasts())) - return true; - if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { - const CXXMethodDecl *MD = MCE->getMethodDecl(); - const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); - - if (MD && RD && RD->isInStdNamespace()) - if (MD->getName() == "c_str" && RD->getName() == "basic_string") - return true; - } - return false; -} - -// Return true iff at least one of following cases holds: -// 1. Format string is a literal and there is an unsafe pointer argument -// corresponding to an `s` specifier; -// 2. Format string is not a literal and there is least an unsafe pointer -// argument (including the formatter argument). -// -// `UnsafeArg` is the output argument that will be set only if this function -// returns true. 
-static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, - const unsigned FmtArgIdx, ASTContext &Ctx, - bool isKprintf = false) { - class StringFormatStringHandler - : public analyze_format_string::FormatStringHandler { - const CallExpr *Call; - unsigned FmtArgIdx; - const Expr *&UnsafeArg; - - public: - StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, - const Expr *&UnsafeArg) - : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} - - bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen, - const TargetInfo &Target) override { - if (FS.getConversionSpecifier().getKind() == - analyze_printf::PrintfConversionSpecifier::sArg) { - unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; - - if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) - if (!isNullTermPointer(Call->getArg(ArgIdx))) { - UnsafeArg = Call->getArg(ArgIdx); // output - // returning false stops parsing immediately - return false; - } - } - return true; // continue parsing - } - }; - - const Expr *Fmt = Call->getArg(FmtArgIdx); - - if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { - StringRef FmtStr = SL->getString(); - StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); - - return analyze_format_string::ParsePrintfString( - Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), - Ctx.getTargetInfo(), isKprintf); - } - // If format is not a string literal, we cannot analyze the format string. - // In this case, this call is considered unsafe if at least one argument - // (including the format argument) is unsafe pointer. - return llvm::any_of( - llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), - [&UnsafeArg](const Expr *Arg) -> bool { - if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { - UnsafeArg = Arg; - return true; - } - return false; - }); -} - -// Matches a FunctionDecl node such that -// 1. It's name, after stripping off predefined prefix and suffix, is -// `CoreName`; and -// 2. `CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which -// is a set of libc function names. -// -// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. -// The notation `CoreName[str/wcs]` means a new name obtained from replace -// string "wcs" with "str" in `CoreName`. -AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { - static std::unique_ptr> PredefinedNames = nullptr; - if (!PredefinedNames) - PredefinedNames = - std::make_unique, std::set>({ - // numeric conversion: - "atof", - "atoi", - "atol", - "atoll", - "strtol", - "strtoll", - "strtoul", - "strtoull", - "strtof", - "strtod", - "strtold", - "strtoimax", - "strtoumax", - // "strfromf", "strfromd", "strfroml", // C23? 
- // string manipulation: - "strcpy", - "strncpy", - "strlcpy", - "strcat", - "strncat", - "strlcat", - "strxfrm", - "strdup", - "strndup", - // string examination: - "strlen", - "strnlen", - "strcmp", - "strncmp", - "stricmp", - "strcasecmp", - "strcoll", - "strchr", - "strrchr", - "strspn", - "strcspn", - "strpbrk", - "strstr", - "strtok", - // "mem-" functions - "memchr", - "wmemchr", - "memcmp", - "wmemcmp", - "memcpy", - "memccpy", - "mempcpy", - "wmemcpy", - "memmove", - "wmemmove", - "memset", - "wmemset", - // IO: - "fread", - "fwrite", - "fgets", - "fgetws", - "gets", - "fputs", - "fputws", - "puts", - // others - "strerror_s", - "strerror_r", - "bcopy", - "bzero", - "bsearch", - "qsort", - }); - - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - // Match predefined names: - if (PredefinedNames->find(Name) != PredefinedNames->end()) - return true; - - std::string NameWCS = Name.str(); - size_t WcsPos = NameWCS.find("wcs"); - - while (WcsPos != std::string::npos) { - NameWCS[WcsPos++] = 's'; - NameWCS[WcsPos++] = 't'; - NameWCS[WcsPos++] = 'r'; - WcsPos = NameWCS.find("wcs", WcsPos); - } - if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) - return true; - // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They - // all should end with "scanf"): - return Name.ends_with("scanf"); -} - -// Match a call to one of the `v*printf` functions taking `va_list`. We cannot -// check safety for these functions so they should be changed to their -// non-va_list versions. -AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf")) - return false; // neither printf nor scanf - return Name.starts_with("v"); -} - -// Matches a call to one of the `sprintf` functions as they are always unsafe -// and should be changed to `snprintf`. -AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf") || - // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: - Name.starts_with("v")) - return false; - - StringRef Prefix = Name.drop_back(6); - - if (Prefix.ends_with("w")) - Prefix = Prefix.drop_back(1); - return Prefix == "s"; -} - -// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide -// character versions. Calls to these functions can be safe if their arguments -// are carefully made safe. -AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { - auto *II = Node.getIdentifier(); - - if (!II) - return false; - - StringRef Name = LibcFunNamePrefixSuffixParser().matchName( - II->getName(), Node.getBuiltinID()); - - if (!Name.ends_with("printf") || Name.starts_with("v")) - return false; - - StringRef Prefix = Name.drop_back(6); - - if (Prefix.ends_with("w")) - Prefix = Prefix.drop_back(1); - - return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; -} - -// This matcher requires that it is known that the callee `isNormalPrintf`. -// Then if the format string is a string literal, this matcher matches when at -// least one string argument is unsafe. 
If the format is not a string literal, -// this matcher matches when at least one pointer type argument is unsafe. -AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, - clang::ast_matchers::internal::Matcher, - UnsafeStringArgMatcher) { - // Determine what printf it is: - const Expr *FirstArg = Node.getArg(0); - ASTContext &Ctx = Finder->getASTContext(); - - if (isa(FirstArg->IgnoreParenImpCasts())) { - // It is a printf/kprintf. And, the format is a string literal: - bool isKprintf = false; - const Expr *UnsafeArg; - - if (auto *Callee = Node.getDirectCallee()) - if (auto *II = Node.getDirectCallee()->getIdentifier()) - isKprintf = II->getName() == "kprintf"; - if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - - QualType PtrTy = FirstArg->getType(); - - assert(PtrTy->isPointerType()); - - QualType PteTy = (cast(PtrTy))->getPointeeType(); - - if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, - there can't be any file pointer then */ - && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { - // It is a fprintf: - const Expr *UnsafeArg; - - if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - - const Expr *SecondArg = Node.getArg(1); - - if (SecondArg->getType()->isIntegerType()) { - // It is a snprintf: - const Expr *UnsafeArg; - - if (unsigned UnsafeArgIdx = - hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) - return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); - return false; - } - // It is printf but the format string is passed by pointer. The only thing we - // can do is to require all pointers to be null-terminated: - for (auto Arg : Node.arguments()) - if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) - if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) - return true; - return false; -} - -// This matcher requires that it is known that the callee `isNormalPrintf`. -// Then it matches if the first two arguments of the call is a pointer and an -// integer and they are not in a safe pattern. -// -// For the first two arguments: `ptr` and `size`, they are safe if in the -// following patterns: -// ptr := DRE.data(); -// size:= DRE.size()/DRE.size_bytes() -// And DRE is a hardened container or view. 
-AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { - if (Node.getNumArgs() < 3) - return false; // not an snprintf call - - const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); - - if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) - return false; // not an snprintf call - - static StringRef SizedObjs[] = {"span", "array", "vector", - "basic_string_view", "basic_string"}; - Buf = Buf->IgnoreParenImpCasts(); - Size = Size->IgnoreParenImpCasts(); - if (auto *MCEPtr = dyn_cast(Buf)) - if (auto *MCESize = dyn_cast(Size)) { - auto *DREOfPtr = dyn_cast( - MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); - auto *DREOfSize = dyn_cast( - MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); - - if (!DREOfPtr || !DREOfSize) - return true; // not in safe pattern - if (DREOfPtr->getDecl() != DREOfSize->getDecl()) - return true; // not in safe pattern - if (MCEPtr->getMethodDecl()->getName() != "data") - return true; // not in safe pattern - - if (MCESize->getMethodDecl()->getName() == "size_bytes" || - // Note here the pointer must be a pointer-to-char type unless there - // is explicit casting. If there is explicit casting, this branch - // is unreachable. Thus, at this branch "size" and "size_bytes" are - // equivalent as the pointer is a char pointer: - MCESize->getMethodDecl()->getName() == "size") - for (StringRef SizedObj : SizedObjs) - if (MCEPtr->getRecordDecl()->isInStdNamespace() && - MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == - SizedObj) - return false; // It is in fact safe - } - return true; // ptr and size are not in safe pattern -} -} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -1450,97 +1030,6 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; -class UnsafeLibcFunctionCallGadget : public WarningGadget { - const CallExpr *const Call; - const Expr *UnsafeArg = nullptr; - constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; - // Extra tags for additional information: - constexpr static const char *const UnsafeSprintfTag = - "UnsafeLibcFunctionCall_sprintf"; - constexpr static const char *const UnsafeSizedByTag = - "UnsafeLibcFunctionCall_sized_by"; - constexpr static const char *const UnsafeStringTag = - "UnsafeLibcFunctionCall_string"; - constexpr static const char *const UnsafeVaListTag = - "UnsafeLibcFunctionCall_va_list"; - - enum UnsafeKind { - OTHERS = 0, // no specific information, the callee function is unsafe - SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
- SIZED_BY = - 2, // the first two arguments of `snprintf` function have - // "__sized_by" relation but they do not conform to safe patterns - STRING = 3, // an argument is a pointer-to-char-as-string but does not - // guarantee null-termination - VA_LIST = 4, // one of the `-printf`s function that take va_list, which is - // considered unsafe as it is not compile-time check - } WarnedFunKind = OTHERS; - -public: - UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) - : WarningGadget(Kind::UnsafeLibcFunctionCall), - Call(Result.Nodes.getNodeAs(Tag)) { - if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) - WarnedFunKind = SPRINTF; - else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { - WarnedFunKind = STRING; - UnsafeArg = E; - } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { - WarnedFunKind = SIZED_BY; - UnsafeArg = Call->getArg(0); - } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) - WarnedFunKind = VA_LIST; - } - - static Matcher matcher() { - return stmt(anyOf( - callExpr( - callee(functionDecl(anyOf( - // Match a predefined unsafe libc - // function: - functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), - // Match a call to one of the `v*printf` functions - // taking va-list, which cannot be checked at - // compile-time: - functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) - .bind(UnsafeVaListTag), - // Match a call to a `sprintf` function, which is never - // safe: - functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) - .bind(UnsafeSprintfTag)))), - // (unless the call has a sole string literal argument): - unless( - allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), - - // The following two cases require checking against actual - // arguments of the call: - - // Match a call to an `snprintf` function. And first two - // arguments of the call (that describe a buffer) are not in - // safe patterns: - callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), - libc_func_matchers::hasUnsafeSnprintfBuffer()) - .bind(UnsafeSizedByTag), - // Match a call to a `printf` function, which can be safe if - // all arguments are null-terminated: - callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), - libc_func_matchers::hasUnsafePrintfStringArg( - expr().bind(UnsafeStringTag))))); - } - - const Stmt *getBaseStmt() const { return Call; } - - SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } - - void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, - bool IsRelatedToDecl, - ASTContext &Ctx) const override { - Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); - } - - DeclUseList getClaimedVarUseSites() const override { return {}; } -}; - // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 72078ae1534b0..e6ce89dc7ec40 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,20 +2304,6 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } - void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, - ASTContext &Ctx, - const Expr *UnsafeArg = nullptr) override { - S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) - << Call->getDirectCallee() // We've checked there is a direct callee - << Call->getSourceRange(); - if (PrintfInfo > 0) { - SourceRange R = - UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); - S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) - << PrintfInfo << R; - } - } - void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp deleted file mode 100644 index 2bd12db93fd52..0000000000000 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ -// RUN: -verify %s - -namespace std { - inline namespace __1 { - template< class InputIt, class OutputIt > - OutputIt copy( InputIt first, InputIt last, - OutputIt d_first ); - - struct iterator{}; - template - struct span { - T * ptr; - T * data(); - unsigned size_bytes(); - unsigned size(); - iterator begin() const noexcept; - iterator end() const noexcept; - }; - - template - struct basic_string { - T* p; - T *c_str(); - T *data(); - unsigned size_bytes(); - }; - - typedef basic_string string; - typedef basic_string wstring; - - // C function under std: - void memcpy(); - void strcpy(); - int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); - } -} - -void f(char * p, char * q, std::span s) { - std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - - /* Test printfs */ - std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn - std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn -} - -void v(std::string s1) { - std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn - std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn -} - -void g(char *begin, char *end, char *p, std::span s) { - std::copy(begin, end, p); // no warn - std::copy(s.begin(), s.end(), s.begin()); // no warn -} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp deleted file mode 100644 index 1a29654f660c9..0000000000000 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ -// RUN: -verify %s - -typedef struct {} FILE; -void memcpy(); -void __asan_memcpy(); -void strcpy(); -void strcpy_s(); -void wcscpy_s(); -unsigned strlen( const char* str ); -int fprintf( FILE* stream, const char* format, ... ); -int printf( const char* format, ... ); -int sprintf( char* buffer, const char* format, ... ); -int swprintf( char* buffer, const char* format, ... ); -int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); -int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); -int sscanf_s(const char * buffer, const char * format, ...); -int sscanf(const char * buffer, const char * format, ... 
); - -namespace std { - template< class InputIt, class OutputIt > - OutputIt copy( InputIt first, InputIt last, - OutputIt d_first ); - - struct iterator{}; - template - struct span { - T * ptr; - T * data(); - unsigned size_bytes(); - unsigned size(); - iterator begin() const noexcept; - iterator end() const noexcept; - }; - - template - struct basic_string { - T* p; - T *c_str(); - T *data(); - unsigned size_bytes(); - }; - - typedef basic_string string; - typedef basic_string wstring; - - // C function under std: - void memcpy(); - void strcpy(); -} - -void f(char * p, char * q, std::span s, std::span s2) { - memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} - __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} - __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} - __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} - strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} - strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} - wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} - - - /* Test printfs */ - fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} - printf("%s%d", // expected-warning{{function 'printf' is unsafe}} - p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument - *p); - sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} - swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} - snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} - snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} - s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer - s2.size(), - "%s%d", "hello", *p); - vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} - sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} - sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} - fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} - fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn - printf("%s%d", "hello", *p); // no warn - snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn - snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn - strlen("hello");// no warn -} - -void v(std::string s1, int *p) { - snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn - printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn - fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn - fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn -} - - -void g(char *begin, char *end, char *p, std::span s) { - std::copy(begin, end, p); // no warn - std::copy(s.begin(), s.end(), s.begin()); // no warn -} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 989931e41c0cc..844311c3a51a5 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,6 +1,8 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s +// expected-no-diagnostics + typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } From 1254259e325428c5912843aa94f6fc663a40ea1b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 17:03:18 -0700 Subject: [PATCH 169/425] [hwasan] Disable test with hwasan-aliasing It's likely flaky because we tag the stack, which is unsupported in this mode. --- .../test/sanitizer_common/TestCases/Posix/fork_threaded.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c index 27b67db0c0a38..5f26ba2f330bd 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/fork_threaded.c @@ -21,6 +21,9 @@ // thread. // 2. Stack poisoned by `inparent` is not poisoned in `inchild` thread. +// Stack tagging is unsupported. +// UNSUPPORTED: hwasan-aliasing + #include #include #include From ef1ef03d4c1014d41713feb0c7edc4d0e36982f4 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Wed, 4 Sep 2024 17:38:25 -0700 Subject: [PATCH 170/425] [BPF] Fix dst/val mismatch in class ATOMIC_NOFETCH (#107288) All ATOMIC_NOFETCH insns have "$dst = $val" constraints. So let us enforce "$dst = $val" having the same register type in ATOMIC_NOFETCH as well. Currently, things work since ATOMIC_NOFETCH does not have source code pattern matching. I am experimenting to introduce memory ordering to BPFInstrInfo.td file and pattern matching will be needed. Eventually, for atomic_fetch_*() insns locked insns could be generated if memory ordering is memory_order_relaxed. 
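
As an illustrative sketch only (not part of this patch; the function name and
shape are hypothetical), the kind of C source where a relaxed atomic_fetch_add
could eventually be lowered to a plain locked add rather than a fetching insn
is one where the old value is discarded and the ordering is relaxed:

    #include <stdint.h>

    /* The result of the fetch is unused and the ordering is relaxed, so a
       non-fetching (locked) BPF atomic add would be sufficient here. */
    void bump_counter(uint64_t *counter) {
      (void)__atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
    }
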
[1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/ --- llvm/lib/Target/BPF/BPFInstrInfo.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 6c750af5c2fd9..f7e17901c7ed5 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -790,7 +790,7 @@ let Predicates = [BPFNoALU32] in { class ATOMIC_NOFETCH : TYPE_LD_ST { From c82a5496c80747981efb8d25ad8bc4d8c6785b2e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 17:49:50 -0700 Subject: [PATCH 171/425] [RISCV] Support fixed vector VP_LOAD/STORE for bf16 and f16 without Zvfh. (#107297) This allows odd sized vector load/store to be legalized to a VP_LOAD/STORE using EVL. I changed the bf16 tests in fixed-vectors-load.ll and fixed-vectors-store.ll to use an illegal type to be consistent with the intent of these files. A legal type is already tested in fixed-vectors-load-store.ll --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 2786 ++++++----------- .../CodeGen/RISCV/rvv/fixed-vectors-load.ll | 10 +- .../CodeGen/RISCV/rvv/fixed-vectors-store.ll | 8 +- 4 files changed, 974 insertions(+), 1834 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2ea909b085a6d..a9061a05c7c67 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1317,6 +1317,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // FIXME: mload, mstore, mgather, mscatter, vp_load/store, // vp_stride_load/store, vp_gather/scatter can be hoisted to here. setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + setOperationAction({ISD::VP_LOAD, ISD::VP_STORE}, VT, Custom); setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, @@ -1378,8 +1379,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom); - setOperationAction({ISD::VP_LOAD, ISD::VP_STORE, - ISD::EXPERIMENTAL_VP_STRIDED_LOAD, + setOperationAction({ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 56cd718536daa..d996a9c05aca4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -49,42 +49,20 @@ define void @fadd_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; 
ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fadd <6 x half> %a, %b @@ -173,42 +151,20 @@ define void @fsub_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, 
ptr %x %b = load <6 x half>, ptr %y %c = fsub <6 x half> %a, %b @@ -297,42 +253,20 @@ define void @fmul_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fmul <6 x half> %a, %b @@ -421,42 +355,20 @@ define void @fdiv_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v8, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v8, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fdiv <6 x half> %a, %b @@ -576,115 +488,55 @@ define void @fneg_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fneg_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: xor a5, a5, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: xor a6, a6, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: lui t0, 8 -; ZVFHMIN-RV32-NEXT: xor a7, a7, t0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: xor a6, a7, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: xor a3, a6, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fneg_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: 
.cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV64-NEXT: lui a3, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fneg_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; 
ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = fneg <6 x half> %a store <6 x half> %b, ptr %x @@ -851,9 +703,10 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV32: # %bb.0: ; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) @@ -864,44 +717,35 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-RV32-NEXT: addi a3, a3, -1 -; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) ; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: and a5, a5, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-RV32-NEXT: and a7, a7, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a7 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: and a6, a7, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-RV32-NEXT: and a3, a6, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV32-NEXT: ret ; @@ -909,9 +753,10 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV64: # %bb.0: ; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vle16.v v8, 
(a0) ; ZVFHMIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) @@ -948,12 +793,9 @@ define void @fabs_v6f16(ptr %x) { ; ZVFHMIN-RV64-NEXT: and a1, a1, a3 ; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -1314,10 +1156,11 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) @@ -1345,35 +1188,26 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -1381,10 +1215,11 
@@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) @@ -1429,12 +1264,9 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -1442,93 +1274,85 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: lui t1, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t1, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, t1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t2, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t0, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, t2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t2, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t1, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, t1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, t1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 
4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -1536,10 +1360,11 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) @@ -1611,12 +1436,9 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -1909,9 +1731,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) @@ -1928,36 +1751,27 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a2 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.h 
fa5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -1965,9 +1779,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) @@ -2002,12 +1817,9 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -2015,70 +1827,62 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a4, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a2, -1 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a4, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, a7, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t0, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -2086,9 +1890,10 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 @@ -2138,12 +1943,9 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -2523,10 +2325,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) @@ -2549,40 +2352,31 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: flh 
fa3, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -2590,10 +2384,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) @@ -2638,12 +2433,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -2651,10 +2443,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) @@ -2665,87 +2458,78 @@ define void @copysign_neg_v6f16(ptr %x, 
ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a4, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a4) +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not t3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a5, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui t0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, t0, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a7, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, t4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t3, t3, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t3, t4, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, t1, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t1, t4, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, t0 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, t4, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, t1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a6, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -2753,10 +2537,11 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) @@ -2836,12 +2621,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; 
ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -3112,135 +2894,124 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 ; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; ; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; 
ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa3, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa1, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e32, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 16(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 ; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 @@ -3249,75 +3020,66 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; ; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle64.v v9, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 +; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v10, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 
1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a4, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 24 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fsh fa5, 4(a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <3 x half>, ptr %x %b = load <3 x float>, ptr %y @@ -3385,38 +3147,18 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: sqrt_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsqrt.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: sqrt_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsqrt.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: sqrt_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsqrt.v v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.sqrt.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -3508,46 +3250,22 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: 
fma_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a2) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -3692,128 +3410,63 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a2) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV32-NEXT: lui a3, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a2) -; 
ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a1) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-RV64-NEXT: lui a3, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a1, a3 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmsub_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a2) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vse16.v v8, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: flh fa3, 4(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-NEXT: lui a3, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-NEXT: xor a4, a4, a3 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: xor a2, a2, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx 
v11, v11, a1 +; ZVFHMIN-NEXT: xor a2, a2, a3 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: xor a1, a1, a3 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -4565,52 +4218,25 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, 
i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4699,52 +4325,25 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fadd_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fadd_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fadd_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4833,52 +4432,25 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -4967,52 +4539,25 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fsub_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fsub_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; 
ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fsub_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5101,52 +4646,25 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; 
ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5235,52 +4753,25 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmul_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmul_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmul_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma 
+; ZVFHMIN-NEXT: vfmul.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5369,52 +4860,25 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v9, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5503,52 +4967,25 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fdiv_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; 
ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfdiv.vv v8, v10, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fdiv_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfdiv.vv v8, v10, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fdiv_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v8, v10, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -5642,56 +5079,27 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, 
v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fma_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -5791,56 +5199,27 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fma_fv_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-RV32-NEXT: li a2, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fma_fv_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa5 -; 
ZVFHMIN-RV64-NEXT: li a2, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v10 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fma_fv_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -5982,138 +5361,68 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_vf_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: li a4, 192 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-RV32-NEXT: vmerge.vxm v9, v9, a3, v0 -; ZVFHMIN-RV32-NEXT: lui a1, 1048568 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v10, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a5 -; ZVFHMIN-RV32-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; 
ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_vf_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: li a4, 192 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-RV64-NEXT: vmerge.vxm v9, v9, a3, v0 -; ZVFHMIN-RV64-NEXT: lui a1, 1048568 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v10, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a5 -; ZVFHMIN-RV64-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmadd.vv v9, v11, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; 
ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmsub_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: mv a1, sp +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vse16.v v9, (a1) +; ZVFHMIN-NEXT: flh fa5, 2(sp) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 0(sp) +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: li a4, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a4 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 4(sp) +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a3, v0 +; ZVFHMIN-NEXT: lui a1, 1048568 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 6(sp) +; ZVFHMIN-NEXT: lui a5, 8 +; ZVFHMIN-NEXT: xor a4, a4, a5 +; ZVFHMIN-NEXT: vmv.v.x v10, a4 +; ZVFHMIN-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-NEXT: flh fa5, 10(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: flh fa5, 8(sp) +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 +; ZVFHMIN-NEXT: xor a4, a4, a1 +; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 12(sp) +; ZVFHMIN-NEXT: xor a2, a2, a1 +; ZVFHMIN-NEXT: xor a3, a3, a5 +; ZVFHMIN-NEXT: vmv.v.x v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-NEXT: flh fa5, 14(sp) +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 +; ZVFHMIN-NEXT: xor a3, a3, a1 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 +; ZVFHMIN-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-NEXT: xor a1, a2, a1 +; ZVFHMIN-NEXT: vmv.v.i v0, 15 +; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 +; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = insertelement <6 x half> poison, half %z, i32 0 @@ -6715,9 +6024,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32: # %bb.0: ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0) @@ -6750,79 +6060,69 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_8 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: 
fcvt.h.w fa1, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a3, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a4, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a4, .LBB116_12 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a4, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_14 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14: ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 ; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a6, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a6, .LBB116_16 +; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a5, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a5, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz +; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz ; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 +; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: ret ; @@ -6830,9 +6130,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64: # %bb.0: ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0) @@ -6925,13 +6226,9 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFH-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: ret ; @@ -6939,9 +6236,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32: # %bb.0: ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 @@ -6980,89 +6278,79 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, 
fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8: ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_10 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_10 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a3, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a4, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a4, .LBB116_12 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_12 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a4, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_14 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_14 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14: ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: 
fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a6, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a6, .LBB116_16 +; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_16 ; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a5, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a5, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz +; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz ; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16: ; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV32-NEXT: ret ; @@ -7070,9 +6358,10 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64: # %bb.0: ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 ; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 @@ -7181,13 +6470,9 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse32.v v8, (a0) +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) ; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 ; ZVFHMIN-ZFHIN-RV64-NEXT: ret %a = load <6 x half>, ptr %x @@ -7320,56 +6605,27 @@ define void @ceil_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: ceil_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 3 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: ceil_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 3 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: ceil_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.ceil.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -7506,56 +6762,27 @@ define void @floor_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: floor_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 2 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: 
vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: floor_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 2 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: floor_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.floor.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -7692,56 +6919,27 @@ define void @round_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: round_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV32-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV32-NEXT: fsrmi a1, 4 -; ZVFHMIN-RV32-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: fsrm a1 -; ZVFHMIN-RV32-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v9, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: round_v6f16: -; ZVFHMIN-RV64: 
# %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfabs.v v8, v9 -; ZVFHMIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-RV64-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-RV64-NEXT: fsrmi a1, 4 -; ZVFHMIN-RV64-NEXT: vfcvt.x.f.v v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: fsrm a1 -; ZVFHMIN-RV64-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v8, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v8, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: round_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a1, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a1 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.round.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -8075,56 +7273,27 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmuladd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfadd.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmuladd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v11 -; 
ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfadd.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV64-NEXT: addi a0, a0, 8 -; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0) -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fmuladd_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a2) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v8, v8, v11 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vse16.v v9, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = load <6 x half>, ptr %z @@ -8233,56 +7402,27 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fmsub_fmuladd_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV32-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV32-NEXT: vfsub.vv v8, v8, v9 -; ZVFHMIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN-RV32-NEXT: addi a1, a0, 8 -; ZVFHMIN-RV32-NEXT: vse32.v v8, (a1) -; ZVFHMIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fmsub_fmuladd_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: vle16.v v10, (a2) -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfmul.vv v8, v8, v11 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-RV64-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-RV64-NEXT: vfsub.vv v8, v8, v9 -; ZVFHMIN-RV64-NEXT: vsetvli zero, 
zero, e16, mf2, ta, ma
-; ZVFHMIN-RV64-NEXT: vfncvt.f.f.w v9, v8
-; ZVFHMIN-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFHMIN-RV64-NEXT: vse64.v v9, (a0)
-; ZVFHMIN-RV64-NEXT: vslidedown.vi v8, v9, 2
-; ZVFHMIN-RV64-NEXT: addi a0, a0, 8
-; ZVFHMIN-RV64-NEXT: vse32.v v8, (a0)
-; ZVFHMIN-RV64-NEXT: ret
+; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vle16.v v9, (a0)
+; ZVFHMIN-NEXT: vle16.v v10, (a2)
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfmul.vv v8, v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vse16.v v9, (a0)
+; ZVFHMIN-NEXT: ret
 %a = load <6 x half>, ptr %x
 %b = load <6 x half>, ptr %y
 %c = load <6 x half>, ptr %z
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 19587438ea947..22cde3c36ef61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -182,12 +182,12 @@ define <16 x i64> @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
 ret <16 x i64> %v
 }
 
-define <8 x bfloat> @load_v8bf16(ptr %p) {
-; CHECK-LABEL: load_v8bf16:
+define <6 x bfloat> @load_v6bf16(ptr %p) {
+; CHECK-LABEL: load_v6bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT: vle16.v v8, (a0)
 ; CHECK-NEXT: ret
- %x = load <8 x bfloat>, ptr %p
- ret <8 x bfloat> %x
+ %x = load <6 x bfloat>, ptr %p
+ ret <6 x bfloat> %x
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index 7f18ee44631a1..169d99abb13c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -294,13 +294,13 @@ define void @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
 ret void
 }
 
-define void @store_v8bf16(ptr %p, <8 x bfloat> %v) {
-; CHECK-LABEL: store_v8bf16:
+define void @store_v6bf16(ptr %p, <6 x bfloat> %v) {
+; CHECK-LABEL: store_v6bf16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT: vse16.v v8, (a0)
 ; CHECK-NEXT: ret
- store <8 x bfloat> %v, ptr %p
+ store <6 x bfloat> %v, ptr %p
 ret void
 }

From b2048de55ea934b70902864767b0cc8dfada8be0 Mon Sep 17 00:00:00 2001
From: Max Winkler
Date: Wed, 4 Sep 2024 18:52:39 -0700
Subject: [PATCH 172/425] [Clang] [Driver] Support `-fjmc` for `*-windows-msvc` target in non cl driver modes (#107177)

Allow `-fjmc` to be used if the target triple targets MSVC (`*-windows-msvc`), regardless of the driver mode used. In general the driver mode shouldn't control the target triple. Also, in our custom build system I am trying to just treat clang as clang. This is because, while the `cl` driver mode emulates the MSVC interface quite well, there are still a lot of operations that are clang specific. The optimization modes do not map directly from MSVC to clang, and warnings do not map from MSVC to clang either.
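As a rough usage sketch (illustrative only; the file name foo.c is hypothetical, and the driver test updates below exercise the same flags), the plain clang driver now forwards the flag for an MSVC triple as long as debug info is enabled:

  # Forwards -fjmc to cc1 in plain clang driver mode when targeting *-windows-msvc.
  clang -fjmc -g -target x86_64-pc-windows-msvc -c foo.c

Without `-g`, the flag is still ignored and the "-fjmc requires debug info" warning is emitted, matching the existing behaviour for ELF targets.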
Instead of wrapping options with `/clang:` when using `clang-cl.exe`, it is simpler to always invoke the clang driver, regardless of the target triple.
---
 clang/lib/Driver/ToolChains/Clang.cpp | 3 ++-
 clang/test/Driver/clang_f_opts.c | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index baac1215215b9..90a747ca58986 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4853,7 +4853,8 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
 
   // This controls whether or not we perform JustMyCode instrumentation.
   if (Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false)) {
-    if (TC.getTriple().isOSBinFormatELF() || D.IsCLMode()) {
+    if (TC.getTriple().isOSBinFormatELF() ||
+        TC.getTriple().isWindowsMSVCEnvironment()) {
       if (DebugInfoKind >= llvm::codegenoptions::DebugInfoConstructor)
         CmdArgs.push_back("-fjmc");
       else if (D.IsCLMode())
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index d69cd199ac61d..335fa546a1388 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -600,10 +600,12 @@
 // CHECK_NO_DISABLE_DIRECT-NOT: -fobjc-disable-direct-methods-for-testing
 
 // RUN: %clang -### -S -fjmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
-// RUN: %clang -### -S -fjmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN_NOT_ELF,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN,CHECK_NOJMC %s
+// RUN: %clang -### -S -fjmc -g -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
+// RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
 // RUN: %clang -### -S -fjmc -g -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC %s
 // RUN: %clang -### -S -fjmc -g -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC %s
-// RUN: %clang -### -fjmc -g -flto -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefixes=CHECK_JMC_WARN_NOT_ELF,CHECK_NOJMC_LTO %s
+// RUN: %clang -### -fjmc -g -flto -target x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
 // RUN: %clang -### -fjmc -g -flto -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_JMC_LTO %s
 // RUN: %clang -### -fjmc -g -flto -fno-jmc -target x86_64-unknown-linux %s 2>&1 | FileCheck -check-prefix=CHECK_NOJMC_LTO %s
 // CHECK_JMC_WARN: -fjmc requires debug info. Use -g or debug options that enable debugger's stepping function; option ignored

From eb2929d323c0c44f2037cf8a345ca6984ec228eb Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Wed, 4 Sep 2024 18:59:42 -0700
Subject: [PATCH 173/425] [DirectX] use DXILMetadataAnalysis to build PSVRuntimeInfo (#107101)

Replace the hardcoded compute-shader values in DXContainer::addPipelineStateValidationInfo. Wave size is still missing. Mark the metadata analysis as preserved by the earlier passes so the information is not lost.
Fix https://github.com/llvm/wg-hlsl/issues/51 --- .../lib/Target/DirectX/DXContainerGlobals.cpp | 29 +++++++++---- llvm/lib/Target/DirectX/DXILPrepare.cpp | 2 + .../Target/DirectX/DXILTranslateMetadata.cpp | 2 + .../DirectX/ContainerData/RuntimeInfoCS.ll | 41 +++++++++++++++++++ 4 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index d47b9c7a25b8f..aa7769899ff27 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" @@ -57,6 +58,7 @@ class DXContainerGlobals : public llvm::ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); + AU.addRequired(); } }; @@ -143,23 +145,35 @@ void DXContainerGlobals::addPipelineStateValidationInfo( SmallString<256> Data; raw_svector_ostream OS(Data); PSVRuntimeInfo PSV; - Triple TT(M.getTargetTriple()); PSV.BaseData.MinimumWaveLaneCount = 0; PSV.BaseData.MaximumWaveLaneCount = std::numeric_limits::max(); + + dxil::ModuleMetadataInfo &MMI = + getAnalysis().getModuleMetadata(); + assert(MMI.EntryPropertyVec.size() == 1 || + MMI.ShaderStage == Triple::Library); PSV.BaseData.ShaderStage = - static_cast(TT.getEnvironment() - Triple::Pixel); + static_cast(MMI.ShaderStage - Triple::Pixel); // Hardcoded values here to unblock loading the shader into D3D. // // TODO: Lots more stuff to do here! // // See issue https://github.com/llvm/llvm-project/issues/96674. 
- PSV.BaseData.NumThreadsX = 1; - PSV.BaseData.NumThreadsY = 1; - PSV.BaseData.NumThreadsZ = 1; - PSV.EntryName = "main"; + switch (MMI.ShaderStage) { + case Triple::Compute: + PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX; + PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY; + PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ; + break; + default: + break; + } + + if (MMI.ShaderStage != Triple::Library) + PSV.EntryName = MMI.EntryPropertyVec[0].Entry->getName(); - PSV.finalize(TT.getEnvironment()); + PSV.finalize(MMI.ShaderStage); PSV.write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); @@ -170,6 +184,7 @@ char DXContainerGlobals::ID = 0; INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_END(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 56098864e987f..f6b7355b93625 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" @@ -247,6 +248,7 @@ class DXILPrepareModule : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } static char ID; // Pass identification. }; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 2c6d20112060d..11cd9df1d1dc4 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -13,6 +13,7 @@ #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" @@ -103,6 +104,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool runOnModule(Module &M) override { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll b/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll new file mode 100644 index 0000000000000..595e70092bb08 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RuntimeInfoCS.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: @dx.psv0 = private constant [80 x i8] c"{{.*}}", section "PSV0", align 4 + +define void @cs_main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="8,8,1" "hlsl.shader"="compute" } + +!dx.valver = !{!0} + +!0 = !{i32 1, i32 7} + +; DXC: - Name: PSV0 +; DXC-NEXT: Size: 80 +; DXC-NEXT: PSVInfo: +; DXC-NEXT: Version: 3 +; DXC-NEXT: ShaderStage: 5 +; DXC-NEXT: MinimumWaveLaneCount: 0 +; DXC-NEXT: MaximumWaveLaneCount: 4294967295 +; DXC-NEXT: UsesViewID: 0 +; DXC-NEXT: SigInputVectors: 0 +; DXC-NEXT: SigOutputVectors: [ 0, 0, 0, 0 ] +; DXC-NEXT: NumThreadsX: 
8 +; DXC-NEXT: NumThreadsY: 8 +; DXC-NEXT: NumThreadsZ: 1 +; DXC-NEXT: EntryName: cs_main +; DXC-NEXT: ResourceStride: 24 +; DXC-NEXT: Resources: [] +; DXC-NEXT: SigInputElements: [] +; DXC-NEXT: SigOutputElements: [] +; DXC-NEXT: SigPatchOrPrimElements: [] +; DXC-NEXT: InputOutputMap: +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] +; DXC-NEXT: - [ ] From c28b1a19aadff97b369889aee084073a181cfda8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 19:14:18 -0700 Subject: [PATCH 174/425] [LegalizeTypes][RISCV] Use SPLAT_VECTOR_PARTS to legalize splat BUILD_VECTOR (#107290) If the element type needs to be expanded, we can use SPLAT_VECTOR_PARTS if the target supports it. There's already a DAGCombine to turn BUILD_VECTOR into SPLAT_VECTOR if the target makes SPLAT_VECTOR legal, but it doesn't fire for vectors that need to be split. --- .../SelectionDAG/LegalizeTypesGeneric.cpp | 9 + .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll | 16 +- .../rvv/fixed-vectors-masked-load-int.ll | 56 ++--- .../rvv/fixed-vectors-masked-store-int.ll | 122 ++++------- .../RISCV/rvv/fixed-vectors-vadd-vp.ll | 172 +++++---------- .../RISCV/rvv/fixed-vectors-vand-vp.ll | 36 ++-- .../RISCV/rvv/fixed-vectors-vmax-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vmin-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vminu-vp.ll | 66 ++---- .../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 187 +++++------------ .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 187 +++++------------ .../RISCV/rvv/fixed-vectors-vssub-vp.ll | 195 ++++++------------ .../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 195 ++++++------------ .../CodeGen/RISCV/srem-seteq-illegal-types.ll | 61 +++--- 15 files changed, 500 insertions(+), 1000 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index b402e82376276..2655e8428309d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -376,6 +376,15 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { assert(OldVT == VecVT.getVectorElementType() && "BUILD_VECTOR operand type doesn't match vector element type!"); + if (VecVT.isInteger() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) && + TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR_PARTS, VecVT)) { + if (SDValue V = cast(N)->getSplatValue()) { + SDValue Lo, Hi; + GetExpandedOp(V, Lo, Hi); + return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, dl, VecVT, Lo, Hi); + } + } + // Build a vector of twice the length out of the expanded elements. // For example <3 x i64> -> <6 x i32>. 
SmallVector NewElts; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index 901be442c0012..d52cbb54c4b2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -14,9 +14,11 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) { ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrintf -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -669,9 +671,11 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrint -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll index ad075e4b4e198..2f20caa6eb189 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll @@ -397,43 +397,22 @@ define void @masked_load_v32i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>) define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { -; RV32-LABEL: masked_load_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vle64.v v24, (a3) -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v8, v0, v16 -; RV32-NEXT: vmseq.vv v0, v24, v16 -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v16, (a1), v0.t -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vle64.v v8, (a0), v0.t -; RV32-NEXT: vse64.v v8, (a2) -; RV32-NEXT: addi a0, a2, 128 -; RV32-NEXT: vse64.v v16, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: masked_load_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, 128 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v16, (a1) -; RV64-NEXT: vle64.v v24, (a3) -; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmseq.vi v0, v24, 0 -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vle64.v v16, (a1), v0.t -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vle64.v v8, (a0), v0.t -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: addi a0, a2, 128 -; RV64-NEXT: vse64.v v16, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: masked_load_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a3, a1, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vle64.v v24, (a3) +; CHECK-NEXT: vmseq.vi v8, v16, 0 +; CHECK-NEXT: vmseq.vi v0, v24, 0 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v16, (a1), v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v 
v8, (a0), v0.t +; CHECK-NEXT: vse64.v v8, (a2) +; CHECK-NEXT: addi a0, a2, 128 +; CHECK-NEXT: vse64.v v16, (a0) +; CHECK-NEXT: ret %m = load <32 x i64>, ptr %m_ptr %mask = icmp eq <32 x i64> %m, zeroinitializer %load = call <32 x i64> @llvm.masked.load.v32i64(ptr %a, i32 8, <32 x i1> %mask, <32 x i64> undef) @@ -547,3 +526,6 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ret void } declare <256 x i8> @llvm.masked.load.v256i8(ptr, i32, <256 x i1>, <256 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index 86c28247e97ef..90690bbc8e208 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -397,87 +397,44 @@ define void @masked_store_v32i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { declare void @llvm.masked.store.v32i32.p0(<32 x i32>, ptr, i32, <32 x i1>) define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { -; RV32-LABEL: masked_store_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: sub sp, sp, a3 -; RV32-NEXT: addi a3, a2, 128 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a2) -; RV32-NEXT: vle64.v v8, (a3) -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v7, v24, v8 -; RV32-NEXT: addi a2, a0, 128 -; RV32-NEXT: vle64.v v24, (a2) -; RV32-NEXT: vle64.v v16, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmseq.vv v0, v16, v8 -; RV32-NEXT: addi a0, a1, 128 -; RV32-NEXT: vse64.v v24, (a0), v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vse64.v v8, (a1), v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: masked_store_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 4 -; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a2) -; RV64-NEXT: addi a2, a2, 128 -; RV64-NEXT: vle64.v v16, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vmseq.vi v8, v16, 0 -; 
RV64-NEXT: vse64.v v24, (a1), v0.t -; RV64-NEXT: addi a0, a1, 128 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vse64.v v8, (a0), v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: masked_store_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: sub sp, sp, a3 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a2) +; CHECK-NEXT: addi a2, a2, 128 +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmseq.vi v8, v16, 0 +; CHECK-NEXT: vse64.v v24, (a1), v0.t +; CHECK-NEXT: addi a0, a1, 128 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %m = load <32 x i64>, ptr %m_ptr %mask = icmp eq <32 x i64> %m, zeroinitializer %val = load <32 x i64>, ptr %val_ptr @@ -683,3 +640,6 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ret void } declare void @llvm.masked.store.v256i8.p0(<256 x i8>, ptr, i32, <256 x i1>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index 5601bd5ee7a3a..805a3c640957b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -1346,93 +1346,48 @@ define <16 x i64> @vadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vadd_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vadd_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li 
a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1440,49 +1395,26 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vadd.vi on RV32. define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vadd_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vadd_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vadd_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vadd_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll index d414be76672ab..c413dd86f3712 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll @@ -1139,18 +1139,16 @@ define <11 x i64> @vand_vv_v11i64_unmasked(<11 x i64> %va, <11 x i64> %b, i32 ze define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vand_vx_v11i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v16, v0 -; RV32-NEXT: lui a3, 341 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vmerge.vxm v24, v24, a0, v0 -; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vand_vx_v11i64: @@ -1167,16 +1165,16 @@ define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zero define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %evl) { ; RV32-LABEL: vand_vx_v11i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 32 -; RV32-NEXT: lui a4, 341 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vmerge.vxm v16, v16, a0, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vand_vx_v11i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index 0b0d758ad8ded..6adc6ba9621a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -1018,51 +1018,27 @@ define <16 x i64> @vmax_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext % declare <32 x i64> @llvm.vp.smax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmax_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmax.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmax.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmax_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; 
RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmax.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmax.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmax_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmax.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.smax.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 98e630a0e59e5..baeb372c017e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -1017,51 +1017,27 @@ define <16 x i64> @vmaxu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext declare <32 x i64> @llvm.vp.umax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmaxu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmaxu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmaxu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmaxu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmaxu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmaxu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmaxu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; 
CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmaxu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.umax.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index a6e3764b37550..d0c21ce05c025 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -1018,51 +1018,27 @@ define <16 x i64> @vmin_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext % declare <32 x i64> @llvm.vp.smin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vmin_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmin.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmin.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vmin_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vmin.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vmin.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vmin_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmin.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.smin.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index c59b65edd1ec1..a730ba4729d25 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -1017,51 +1017,27 @@ define <16 x i64> @vminu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext declare <32 x i64> @llvm.vp.umin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vminu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB74_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB74_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vminu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vminu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vminu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB74_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB74_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vminu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vminu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vminu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB74_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB74_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vminu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.umin.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index df2c83028e5df..c5dd6ac344a37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -1359,93 +1359,48 @@ define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vsadd_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vsadd_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: ret %v = 
call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1453,59 +1408,31 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vsadd.vi on RV32. define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsadd_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsadd_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsadd_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsadd_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index f50dadf019910..17d9c437590a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -1355,93 +1355,48 @@ define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 
x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vsaddu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vsaddu_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, 
v16, -1 -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1449,59 +1404,31 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vsaddu.vi on RV32. define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsaddu_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vsaddu_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vsaddu_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vsaddu_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; 
CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index b82ca70477ba3..90e1b5ce55752 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -1399,95 +1399,50 @@ define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vssub_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vssub_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; 
RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a2 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a2 -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1495,61 +1450,33 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vssub.vi on RV32. define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssub_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssub_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssub_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vssub.vv v16, v16, v24, v0.t -; RV32-NEXT: ret 
-; -; RV64-LABEL: vssub_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vssub.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssub_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index 6d8ed563f02bd..59899ab8b9994 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -1394,95 +1394,50 @@ define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vssubu_vx_v32i64: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB108_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB108_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB108_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB108_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a2, v0.t -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a2, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2, v0.t +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: 
vssubu.vx v16, v16, a2, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl) ret <32 x i64> %v } define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; RV32-LABEL: vssubu_vi_v32i64_unmasked: -; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB109_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB109_2: -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24 -; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24 -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vi_v32i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB109_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 -; RV64-NEXT: .LBB109_2: -; RV64-NEXT: li a2, -1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a2 -; RV64-NEXT: addi a1, a0, -16 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a2 -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vi_v32i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v } @@ -1490,61 +1445,33 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; FIXME: We don't match vssubu.vi on RV32. 
define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssubu_vx_v32i64_evl12: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64_evl12: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64_evl12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v } define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { -; RV32-LABEL: vssubu_vx_v32i64_evl27: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t -; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vssubu_vx_v32i64_evl27: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: vssubu_vx_v32i64_evl27: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27) ret <32 x i64> %v } diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index dc27158cfb31f..162f7e34536a7 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -599,13 +599,13 @@ define void @test_srem_vec(ptr %X) nounwind { ; ; RV32MV-LABEL: test_srem_vec: ; RV32MV: # %bb.0: -; RV32MV-NEXT: addi sp, sp, -48 -; RV32MV-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s1, 36(sp) # 4-byte 
Folded Spill -; RV32MV-NEXT: sw s2, 32(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s3, 28(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32MV-NEXT: addi sp, sp, -64 +; RV32MV-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s3, 44(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s4, 40(sp) # 4-byte Folded Spill ; RV32MV-NEXT: csrr a1, vlenb ; RV32MV-NEXT: slli a1, a1, 1 ; RV32MV-NEXT: sub sp, sp, a1 @@ -624,29 +624,33 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: srai s3, a0, 31 ; RV32MV-NEXT: srli a1, a1, 1 ; RV32MV-NEXT: slli a1, a1, 31 -; RV32MV-NEXT: lw a0, 0(s0) ; RV32MV-NEXT: srai s4, a1, 31 +; RV32MV-NEXT: lw a0, 0(s0) ; RV32MV-NEXT: slli a1, a3, 31 ; RV32MV-NEXT: srai a1, a1, 31 +; RV32MV-NEXT: li a2, 1 +; RV32MV-NEXT: sw a2, 20(sp) +; RV32MV-NEXT: li a2, -1 +; RV32MV-NEXT: sw a2, 16(sp) ; RV32MV-NEXT: li a2, 6 ; RV32MV-NEXT: li a3, 0 ; RV32MV-NEXT: call __moddi3 ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32MV-NEXT: vmv.v.x v8, a0 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: addi a0, sp, 16 +; RV32MV-NEXT: addi a0, sp, 32 ; RV32MV-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; RV32MV-NEXT: li a2, 7 ; RV32MV-NEXT: mv a0, s2 ; RV32MV-NEXT: mv a1, s4 ; RV32MV-NEXT: li a3, 0 ; RV32MV-NEXT: call __moddi3 -; RV32MV-NEXT: addi a2, sp, 16 +; RV32MV-NEXT: addi a2, sp, 32 ; RV32MV-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32MV-NEXT: vslide1down.vx v8, v8, a0 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: addi a0, sp, 16 +; RV32MV-NEXT: addi a0, sp, 32 ; RV32MV-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; RV32MV-NEXT: li a2, -5 ; RV32MV-NEXT: li a3, -1 @@ -654,18 +658,17 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: mv a1, s3 ; RV32MV-NEXT: call __moddi3 ; RV32MV-NEXT: addi a2, sp, 16 -; RV32MV-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32MV-NEXT: vlse64.v v8, (a2), zero +; RV32MV-NEXT: addi a2, sp, 32 +; RV32MV-NEXT: vl2r.v v10, (a2) # Unknown-size Folded Reload ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32MV-NEXT: vslide1down.vx v8, v8, a0 -; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: vslidedown.vi v8, v8, 2 -; RV32MV-NEXT: li a0, 511 -; RV32MV-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32MV-NEXT: vmv.v.x v10, a0 -; RV32MV-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32MV-NEXT: vsext.vf4 v12, v10 -; RV32MV-NEXT: vand.vv v8, v8, v12 -; RV32MV-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32MV-NEXT: vslide1down.vx v10, v10, a0 +; RV32MV-NEXT: vslide1down.vx v10, v10, a1 +; RV32MV-NEXT: vslidedown.vi v10, v10, 2 +; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32MV-NEXT: vand.vv v8, v10, v8 +; RV32MV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 ; RV32MV-NEXT: vmv.v.i v11, 0 ; RV32MV-NEXT: vsetivli zero, 3, e8, mf2, tu, ma @@ -712,13 +715,13 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: csrr a0, vlenb ; RV32MV-NEXT: slli a0, a0, 1 ; RV32MV-NEXT: add sp, sp, a0 -; RV32MV-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s2, 32(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s3, 28(sp) # 4-byte Folded 
Reload -; RV32MV-NEXT: lw s4, 24(sp) # 4-byte Folded Reload -; RV32MV-NEXT: addi sp, sp, 48 +; RV32MV-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s3, 44(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s4, 40(sp) # 4-byte Folded Reload +; RV32MV-NEXT: addi sp, sp, 64 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_srem_vec: From c2fc33204caca8c52b27425255bbc78c9e4d99e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 20:41:31 -0700 Subject: [PATCH 175/425] [flang][cuda] Add c_devptr and bypass output semantic check (#107318) Add a builtin type for c_devptr since it will need some special handling for some function like c_f_pointer. `c_ptr` is defined as a builtin type and was raising a semantic error if you try to use it in a I/O statement. This patch add a check for c_ptr and c_devptr to bypass the semantic check and allow the variables of these types to be used in I/O. --- flang/lib/Semantics/check-io.cpp | 5 +++++ flang/module/__fortran_builtins.f90 | 4 ++++ flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 54e8e09cbf7e4..d7d2f0fa322cb 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,6 +1171,11 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } + if (IsBuiltinDerivedType(&derived, "c_ptr") || + IsBuiltinDerivedType(&derived, "c_devptr")) { + // Bypass the check below for c_ptr and c_devptr. + return nullptr; + } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index 44b0f17339cd9..a9d3ac897eb58 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,6 +102,10 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 + + type, public, bind(c) :: __builtin_c_devptr + type(__builtin_c_ptr) :: cptr + end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf new file mode 100644 index 0000000000000..4e11e3c0fc8f8 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -0,0 +1,16 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran specific type + +subroutine sub1() + use iso_c_binding + use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type(c_ptr) :: ptr + type(c_devptr) :: dptr + print*,ptr + print*,dptr +end + +! CHECK-LABEL: func.func @_QPsub1() +! 
CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From aad699776496a80af5e062b446fe26a4313ff3e3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 12:04:33 +0800 Subject: [PATCH 176/425] [RISCV] Fold PseudoVMV_V_V with undef passthru, handling policy (#106943) If a vmv.v.v has an undef passthru then we can just replace it with its input operand, since the tail is completely undefined. This is a reattempt of #106840, but also checks to see if the input was a pseudo where we can relax its tail policy to undef. This also means we don't need to check for undef passthrus in foldVMV_V_V anymore because they will be handled by foldUndefPassthruVMV_V_V. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 42 +++++++++++++++++-- .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir | 1 - .../CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 4 +- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 3 +- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 35c3bc9708d91..026e0a365b8dc 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -66,6 +66,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; bool convertVMergeToVMv(MachineInstr &MI) const; + bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); bool isAllOnesMask(const MachineInstr *MaskDef) const; @@ -472,6 +473,38 @@ bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, return true; } +/// If a PseudoVMV_V_V's passthru is undef then we can replace it with its input +bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMV_V_V) + return false; + if (MI.getOperand(1).getReg() != RISCV::NoRegister) + return false; + + // If the input was a pseudo with a policy operand, we can give it a tail + // agnostic policy if MI's undef tail subsumes the input's. + MachineInstr *Src = MRI->getVRegDef(MI.getOperand(2).getReg()); + if (Src && !Src->hasUnmodeledSideEffects() && + MRI->hasOneUse(MI.getOperand(2).getReg()) && + RISCVII::hasVLOp(Src->getDesc().TSFlags) && + RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && + getSEWLMULRatio(MI) == getSEWLMULRatio(*Src)) { + const MachineOperand &MIVL = MI.getOperand(3); + const MachineOperand &SrcVL = + Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); + + MachineOperand &SrcPolicy = + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())); + + if (isVLKnownLE(MIVL, SrcVL)) + SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC); + } + + MRI->replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); + MI.eraseFromParent(); + V0Defs.erase(&MI); + return true; +} + /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. /// @@ -531,9 +564,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // If MI was tail agnostic and the VL didn't increase, preserve it. 
int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - bool TailAgnostic = (MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) || - Passthru.getReg() == RISCV::NoRegister; - if (TailAgnostic && isVLKnownLE(MI.getOperand(3), SrcVL)) + if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) && + isVLKnownLE(MI.getOperand(3), SrcVL)) Policy |= RISCVII::TAIL_AGNOSTIC; Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); @@ -584,6 +616,10 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { Changed |= convertToUnmasked(MI); Changed |= convertToWholeRegister(MI); Changed |= convertVMergeToVMv(MI); + if (foldUndefPassthruVMV_V_V(MI)) { + Changed |= true; + continue; // MI is erased + } Changed |= foldVMV_V_V(MI); } } diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index 1419eede6ca9d..19a918148e6eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -15,7 +15,6 @@ body: | ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 5 /* e32 */ ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 $noreg, %true, %avl, 5 /* e32 */, 0 /* tu, mu */ %false:vr = COPY $v8 %true:vr = COPY $v9 %avl:gprnox0 = COPY $x1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll index c1602c912da63..7f248a39b54fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll @@ -199,10 +199,8 @@ define @unfoldable_mismatched_sew( %passthr define @undef_passthru( %passthru, %x, %y, iXLen %avl) { ; CHECK-LABEL: undef_passthru: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.v v8, v8 ; CHECK-NEXT: ret %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( %passthru, %x, %y, iXLen %avl) %b = call @llvm.riscv.vmv.v.v.nxv1i64( undef, %a, iXLen %avl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 7e2ac0e26f251..6858231bf0e6c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -69,8 +69,7 @@ body: | ; CHECK: liveins: $v8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %passthru:vr = COPY $v8 - ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ - ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 1 /* ta, mu */ %passthru:vr = COPY $v8 %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ From ad89e617c703239518187912540b8ea811dc2eda Mon Sep 17 00:00:00 2001 From: Matt Hofmann Date: Thu, 5 Sep 2024 00:12:03 -0400 Subject: [PATCH 177/425] [MLIR][Python] Fix detached operation coming from `IfOp` constructor (#107286) Without this fix, `scf.if` operations would be created without a parent. Since `scf.if` operations often have no results, this caused silent bugs where the generated code was straight-up missing the operation. 
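
As a rough usage sketch (standalone, assuming the in-tree MLIR Python bindings; the function and value names here are made up for the example and are not part of this change), the pattern that now works is passing an explicit insertion point to the IfOp constructor and having it honored:

from mlir.ir import Context, Location, Module, InsertionPoint, IntegerType
from mlir.dialects import func, scf

with Context(), Location.unknown():
    module = Module.create()
    with InsertionPoint(module.body):
        i1 = IntegerType.get_signless(1)

        @func.FuncOp.from_py_func(i1)
        def guarded(cond):
            outer = scf.IfOp(cond)
            with InsertionPoint(outer.then_block):
                # The explicit ip= is now forwarded to the underlying builder;
                # previously it was dropped, so an scf.if built this way could
                # be left detached and silently missing from the printed IR.
                inner = scf.IfOp(cond, ip=InsertionPoint(outer.then_block))
                with InsertionPoint(inner.then_block):
                    scf.YieldOp([])
                scf.YieldOp([])
            return

    print(module)
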
--- mlir/python/mlir/dialects/scf.py | 2 +- mlir/test/python/dialects/scf.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py index 7025f6e0f1a16..2d0047b76c702 100644 --- a/mlir/python/mlir/dialects/scf.py +++ b/mlir/python/mlir/dialects/scf.py @@ -87,7 +87,7 @@ def __init__(self, cond, results_=None, *, hasElse=False, loc=None, ip=None): operands.append(cond) results = [] results.extend(results_) - super().__init__(results, cond) + super().__init__(results, cond, loc=loc, ip=ip) self.regions[0].blocks.append(*[]) if hasElse: self.regions[1].blocks.append(*[]) diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py index 95a6de86b670d..de61f4613868f 100644 --- a/mlir/test/python/dialects/scf.py +++ b/mlir/test/python/dialects/scf.py @@ -278,6 +278,32 @@ def simple_if(cond): # CHECK: return +@constructAndPrintInModule +def testNestedIf(): + bool = IntegerType.get_signless(1) + i32 = IntegerType.get_signless(32) + + @func.FuncOp.from_py_func(bool, bool) + def nested_if(b, c): + if_op = scf.IfOp(b) + with InsertionPoint(if_op.then_block) as ip: + if_op = scf.IfOp(c, ip=ip) + with InsertionPoint(if_op.then_block): + one = arith.ConstantOp(i32, 1) + add = arith.AddIOp(one, one) + scf.YieldOp([]) + scf.YieldOp([]) + return + + +# CHECK: func @nested_if(%[[ARG0:.*]]: i1, %[[ARG1:.*]]: i1) +# CHECK: scf.if %[[ARG0:.*]] +# CHECK: scf.if %[[ARG1:.*]] +# CHECK: %[[ONE:.*]] = arith.constant 1 +# CHECK: %[[ADD:.*]] = arith.addi %[[ONE]], %[[ONE]] +# CHECK: return + + @constructAndPrintInModule def testIfWithElse(): bool = IntegerType.get_signless(1) From 41c11ea2af743051013dfcc0fced5a450e2dc9b8 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 4 Sep 2024 21:15:48 -0700 Subject: [PATCH 178/425] [HLSL] Remove variables that are used only in assert (#107299) Changes the assert to test the same condition without using the variables. This change is done in response to a comment [here](https://github.com/llvm/llvm-project/pull/106657#issuecomment-2327493439). 
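
For illustration only, a minimal plain-C++ sketch of the pattern being cleaned up (these types and functions are invented for the example and are not the clang code): locals that exist solely to feed an assert force (void) casts to keep NDEBUG builds warning-free, whereas asserting the condition directly avoids both.

#include <cassert>

struct Decl { virtual ~Decl() = default; };
struct VarDecl : Decl {};
struct BufferDecl : Decl {};

// Before: the locals are only read inside assert(), so release (NDEBUG)
// builds see unused variables unless each one is (void)-cast away.
void checkExactlyOneKindBefore(Decl *D) {
  auto *Var = dynamic_cast<VarDecl *>(D);
  auto *Buf = dynamic_cast<BufferDecl *>(D);
  assert(((Var && !Buf) || (!Var && Buf)) && "expected exactly one kind");
  (void)Var;
  (void)Buf;
}

// After: the assert evaluates the same condition in place, so there are no
// assert-only locals and no (void) casts.
void checkExactlyOneKindAfter(Decl *D) {
  assert(((dynamic_cast<VarDecl *>(D) && !dynamic_cast<BufferDecl *>(D)) ||
          (!dynamic_cast<VarDecl *>(D) && dynamic_cast<BufferDecl *>(D))) &&
         "expected exactly one kind");
  (void)D; // D is otherwise unused once NDEBUG strips the assert
}

int main() {
  VarDecl V;
  checkExactlyOneKindBefore(&V);
  checkExactlyOneKindAfter(&V);
}
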
--- clang/lib/Sema/SemaHLSL.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 65aeda4b7b613..d5ccd3815eb66 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -837,17 +837,10 @@ static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, Decl *TheDecl, RegisterType regType) { - // Samplers, UAVs, and SRVs are VarDecl types - VarDecl *TheVarDecl = dyn_cast(TheDecl); - // Cbuffers and Tbuffers are HLSLBufferDecl types - HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl); - // exactly one of these two types should be set - assert(((TheVarDecl && !CBufferOrTBuffer) || - (!TheVarDecl && CBufferOrTBuffer)) && - "either TheVarDecl or CBufferOrTBuffer should be set"); - (void)TheVarDecl; - (void)CBufferOrTBuffer; + assert(((isa(TheDecl) && !isa(TheDecl)) || + (!isa(TheDecl) && isa(TheDecl))) && + "expecting VarDecl or HLSLBufferDecl"); RegisterBindingFlags Flags = HLSLFillRegisterBindingFlags(S, TheDecl); assert((int)Flags.Other + (int)Flags.Resource + (int)Flags.Basic + From abbcfff706b33a8965afa9f2c520f60ad46f3b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 4 Sep 2024 21:16:00 -0700 Subject: [PATCH 179/425] Revert "[flang][cuda] Add c_devptr and bypass output semantic check" (#107349) Reverts llvm/llvm-project#107318 It breaks a test https://lab.llvm.org/buildbot/#/builders/143/builds/1933 --- flang/lib/Semantics/check-io.cpp | 5 ----- flang/module/__fortran_builtins.f90 | 4 ---- flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ---------------- 3 files changed, 25 deletions(-) delete mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index d7d2f0fa322cb..54e8e09cbf7e4 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,11 +1171,6 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } - if (IsBuiltinDerivedType(&derived, "c_ptr") || - IsBuiltinDerivedType(&derived, "c_devptr")) { - // Bypass the check below for c_ptr and c_devptr. - return nullptr; - } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index a9d3ac897eb58..44b0f17339cd9 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,10 +102,6 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 - - type, public, bind(c) :: __builtin_c_devptr - type(__builtin_c_ptr) :: cptr - end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf deleted file mode 100644 index 4e11e3c0fc8f8..0000000000000 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ /dev/null @@ -1,16 +0,0 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s - -! 
Test CUDA Fortran specific type - -subroutine sub1() - use iso_c_binding - use __fortran_builtins, only : c_devptr => __builtin_c_devptr - - type(c_ptr) :: ptr - type(c_devptr) :: dptr - print*,ptr - print*,dptr -end - -! CHECK-LABEL: func.func @_QPsub1() -! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From 1465e23985904d55a014f3377c287ded45c0fa0c Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 5 Sep 2024 12:46:20 +0800 Subject: [PATCH 180/425] [RISCV][llvm] Handle `ptr` element type in `lowerDeinterleaveIntrinsicToLoad` and `lowerInterleaveIntrinsicToStore` (#107079) Resolve https://github.com/llvm/llvm-project/issues/106970 currently it returns 0 fixed size for `ptr` element type. The `ptr` element size should depend on `XLen` which is 64 in riscv64 and 32 in riscv32 respectively. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +++++----- .../RISCV/rvv/vector-deinterleave-load.ll | 22 +++++++++++++++++-- .../RISCV/rvv/vector-interleave-store.ll | 21 ++++++++++++++++-- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a9061a05c7c67..d400b2ea1ca2c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21980,10 +21980,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( VectorType *VTy = cast(DI->getOperand(0)->getType()); VectorType *ResVTy = cast(DI->getType()->getContainedType(0)); + const DataLayout &DL = LI->getDataLayout(); if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), - LI->getDataLayout())) + LI->getPointerAddressSpace(), DL)) return false; Function *VlsegNFunc; @@ -22005,7 +22005,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, Intrinsic::riscv_vlseg8}; - unsigned SEW = ResVTy->getElementType()->getScalarSizeInBits(); + unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType()); unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( LI->getContext(), "riscv.vector.tuple", @@ -22051,10 +22051,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( VectorType *VTy = cast(II->getType()); VectorType *InVTy = cast(II->getOperand(0)->getType()); + const DataLayout &DL = SI->getDataLayout(); if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), - SI->getDataLayout())) + SI->getPointerAddressSpace(), DL)) return false; Function *VssegNFunc; @@ -22075,7 +22075,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, Intrinsic::riscv_vsseg8}; - unsigned SEW = InVTy->getElementType()->getScalarSizeInBits(); + unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( SI->getContext(), "riscv.vector.tuple", diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index e2f956ca03ff8..54373d94f8f5f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m 
| FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers @@ -263,9 +263,27 @@ define {, } @vector_deinterleave_load_ ret {, } %retval } +define {, } @vector_deinterleave_load_nxv2p0_nxv4p0(ptr %p) { +; RV32-LABEL: vector_deinterleave_load_nxv2p0_nxv4p0: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: vector_deinterleave_load_nxv2p0_nxv4p0: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vlseg2e64.v v8, (a0) +; RV64-NEXT: ret + %vec = load , ptr %p + %retval = call {, } @llvm.vector.deinterleave2.nxv4p0( %vec) + ret {, } %retval +} + declare {,} @llvm.vector.deinterleave2.nxv4f16() declare {, } @llvm.vector.deinterleave2.nxv8f16() declare {, } @llvm.vector.deinterleave2.nxv4f32() declare {, } @llvm.vector.deinterleave2.nxv16f16() declare {, } @llvm.vector.deinterleave2.nxv8f32() declare {, } @llvm.vector.deinterleave2.nxv4f64() +declare {, } @llvm.vector.deinterleave2.nxv4p0() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 5ebf63f0a4411..a06aa2d02b11b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers @@ -218,6 +218,22 @@ define void @vector_interleave_store_nxv4f64_nxv2f64( %a, < ret void } +define void @vector_interleave_store_nxv4p0_nxv2p0( %a, %b, ptr %p) { +; RV32-LABEL: vector_interleave_store_nxv4p0_nxv2p0: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_nxv4p0_nxv2p0: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vsseg2e64.v v8, (a0) +; RV64-NEXT: ret + %res = call @llvm.vector.interleave2.nxv4p0( %a, %b) + store %res, ptr %p + ret void +} declare @llvm.vector.interleave2.nxv4f16(, ) declare @llvm.vector.interleave2.nxv8f16(, ) @@ -225,3 +241,4 @@ declare @llvm.vector.interleave2.nxv4f32( @llvm.vector.interleave2.nxv16f16(, ) declare @llvm.vector.interleave2.nxv8f32(, ) declare @llvm.vector.interleave2.nxv4f64(, ) +declare @llvm.vector.interleave2.nxv4p0(, ) From da8fb7f4dddf48b2395f80dc09efffe38efa3d2f Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 5 Sep 2024 12:47:00 +0800 Subject: [PATCH 181/425] [clang][RISCV] Fix typo of vector crypto in SemaRISCV.cpp. 
NFC (#106485) --- clang/lib/Sema/SemaRISCV.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index abf8e4ac2f3e8..56d6f12fbc6e4 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -733,7 +733,7 @@ bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI, if (ElemSize == 64 && !TI.hasFeature("zvknhb")) return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_requires_extension) - << /* IsExtension */ true << TheCall->getSourceRange() << "zvknb"; + << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb"; return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, ElemSize * 4) || From 845d8d909c37c61298d49c0e91949c669ca15215 Mon Sep 17 00:00:00 2001 From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:05:01 +0800 Subject: [PATCH 182/425] [RISCV][TTI] Add cost of typebased cast VPIntrinsics with functionalOPC. (#97797) This patch make the instruction cost of type-based cast VP intrinsics will be same as their non-VP counterpart. This is the following patch of [#93435](https://github.com/llvm/llvm-project/pull/93435) --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 22 +- llvm/test/Analysis/CostModel/RISCV/cast.ll | 3409 ++++++++++++++++- 2 files changed, 3427 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e655200e7a895..e809e15eacf69 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1011,8 +1011,26 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::vp_frem: { std::optional FOp = VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); - if (FOp) - return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind); + assert(FOp.has_value()); + return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind); + break; + } + // vp int cast ops. + case Intrinsic::vp_trunc: + case Intrinsic::vp_zext: + case Intrinsic::vp_sext: + // vp float cast ops. 
+ case Intrinsic::vp_fptoui: + case Intrinsic::vp_fptosi: + case Intrinsic::vp_uitofp: + case Intrinsic::vp_sitofp: + case Intrinsic::vp_fptrunc: + case Intrinsic::vp_fpext: { + std::optional FOp = + VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); + assert(FOp.has_value() && !ICA.getArgTypes().empty()); + return getCastInstrCost(*FOp, RetTy, ICA.getArgTypes()[0], + TTI::CastContextHint::None, CostKind); break; } } diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index c1759b8b03d0f..5650d2cf90eac 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 define void @sext() { ; RV32-LABEL: 'sext' @@ -14,6 +14,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> 
@llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -24,6 +34,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -34,6 +54,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = 
call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -44,6 +74,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -54,6 +94,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -64,6 +114,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -74,6 +134,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: 
%vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -84,6 +154,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -94,6 +174,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to @@ -104,6 +194,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to @@ -114,6 +214,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to @@ -124,6 +234,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to @@ -134,6 +254,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: 
Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -144,6 +274,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -154,6 +294,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: 
%nxv64i1_nxv64i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to @@ -164,6 +314,16 @@ define void @sext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call 
@llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'sext' @@ -177,6 +337,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -187,6 +357,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -197,6 +377,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -207,6 +397,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -217,6 +417,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> ; 
RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -227,6 +437,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -237,6 +457,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x 
i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -247,6 +477,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to @@ -257,6 +497,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext 
undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to @@ -267,6 +517,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call 
@llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to @@ -277,6 +537,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to @@ -287,6 +557,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to @@ -297,6 +577,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to ; RV64-NEXT: Cost Model: Found an 
estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to @@ -307,6 +597,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext undef to @@ -317,6 +617,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for 
instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to @@ -327,6 +637,16 @@ define void @sext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> @@ -340,6 +660,17 @@ define void @sext() { %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call 
<2 x i32> @llvm.vp.sext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i16.v2i64(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> @@ -351,6 +682,17 @@ define void @sext() { %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> @@ -362,6 +704,17 @@ define void @sext() { %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> @@ -373,6 +726,17 @@ define void @sext() { %v16i1_v16i32 = sext <16 x i1> undef to <16 x 
i32> %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> @@ -384,6 +748,17 @@ define void @sext() { %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> @@ -395,6 +770,17 @@ define void @sext() { %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> 
@llvm.vp.sext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> @@ -406,6 +792,17 @@ define void @sext() { %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> @@ -417,6 +814,17 @@ define void @sext() { %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> + %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef) + %nxv1i8_nxv1i16 = sext undef to %nxv1i8_nxv1i32 = sext undef to %nxv1i8_nxv1i64 = sext undef to @@ -428,6 +836,17 @@ define void @sext() { %nxv1i1_nxv1i32 = sext undef to 
%nxv1i1_nxv1i64 = sext undef to + %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i8.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i8.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i8.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i16.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i16.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i1.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i1.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i1.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i1.nxv1i64( undef, undef, i32 undef) + %nxv2i8_nxv2i16 = sext undef to %nxv2i8_nxv2i32 = sext undef to %nxv2i8_nxv2i64 = sext undef to @@ -439,6 +858,17 @@ define void @sext() { %nxv2i1_nxv2i32 = sext undef to %nxv2i1_nxv2i64 = sext undef to + %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i8.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i8.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i8.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i16.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i16.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i1.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i1.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i1.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i1.nxv2i64( undef, undef, i32 undef) + %nxv4i8_nxv4i16 = sext undef to %nxv4i8_nxv4i32 = sext undef to %nxv4i8_nxv4i64 = sext undef to @@ -450,6 +880,17 @@ define void @sext() { %nxv4i1_nxv4i32 = sext undef to %nxv4i1_nxv4i64 = sext undef to + %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i8.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i8.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i8.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i16.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i16.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i1.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i1.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i1.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i1.nxv4i64( undef, undef, i32 undef) + %nxv8i8_nxv8i16 = sext undef to %nxv8i8_nxv8i32 = sext undef to %nxv8i8_nxv8i64 = sext undef to @@ -461,6 +902,17 @@ define void @sext() { %nxv8i1_nxv8i32 = sext undef to %nxv8i1_nxv8i64 = sext undef to + %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i8.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i8.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i8.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i16.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i16.nxv8i64( undef, 
undef, i32 undef) + %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i1.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i1.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i1.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i1.nxv8i64( undef, undef, i32 undef) + %nxv16i8_nxv16i16 = sext undef to %nxv16i8_nxv16i32 = sext undef to %nxv16i8_nxv16i64 = sext undef to @@ -472,6 +924,17 @@ define void @sext() { %nxv16i1_nxv16i32 = sext undef to %nxv16i1_nxv16i64 = sext undef to + %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i8.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i8.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i8.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i16.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i16.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i1.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i1.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i1.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i1.nxv16i64( undef, undef, i32 undef) + %nxv32i8_nxv32i16 = sext undef to %nxv32i8_nxv32i32 = sext undef to %nxv32i8_nxv32i64 = sext undef to @@ -483,6 +946,17 @@ define void @sext() { %nxv32i1_nxv32i32 = sext undef to %nxv32i1_nxv32i64 = sext undef to + %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i8.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i8.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i8.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i16.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i16.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i1.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i1.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i1.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i1.nxv32i64( undef, undef, i32 undef) + %nxv64i8_nxv64i16 = sext undef to %nxv64i8_nxv64i32 = sext undef to %nxv64i8_nxv64i64 = sext undef to @@ -494,6 +968,17 @@ define void @sext() { %nxv64i1_nxv64i32 = sext undef to %nxv64i1_nxv64i64 = sext undef to + %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i8.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i8.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i8.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i16.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i16.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i1.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i1.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i32 = 
call @llvm.vp.sext.nxv64i1.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i1.nxv64i64( undef, undef, i32 undef) + %nxv128i8_nxv128i16 = sext undef to %nxv128i8_nxv128i32 = sext undef to %nxv128i8_nxv128i128 = sext undef to @@ -505,6 +990,17 @@ define void @sext() { %nxv128i1_nxv128i32 = sext undef to %nxv128i1_nxv128i128 = sext undef to + %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i8.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i8.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i8.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i16.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i16.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i32.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i1.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i1.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i1.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i1.nxv128i128( undef, undef, i32 undef) + ret void } @@ -520,6 +1016,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -530,6 +1036,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -540,6 +1056,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x 
i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -550,6 +1076,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -560,6 +1096,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -570,6 +1116,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> ; RV32-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -580,6 +1136,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: 
%vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -590,6 +1156,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to @@ -600,6 +1176,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to @@ -610,6 +1196,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to @@ -620,6 +1216,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to @@ -630,6 +1236,16 @@ 
define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to @@ -640,6 +1256,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -650,6 +1276,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -660,6 +1296,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: 
%vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to @@ -670,6 +1316,16 @@ define void @zext() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'zext' @@ -683,6 +1339,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -693,6 +1359,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -703,6 +1379,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> 
@llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -713,6 +1399,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -723,6 +1419,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -733,6 +1439,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: 
Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -743,6 +1459,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> 
undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -753,6 +1479,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to @@ -763,6 +1499,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to @@ -773,6 +1519,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) ; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to @@ -783,6 +1539,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to @@ -793,6 +1559,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to @@ -803,6 +1579,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to @@ -813,6 +1599,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%nxv32i1_nxv32i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext undef to @@ -823,6 +1619,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to @@ -833,6 +1639,16 @@ define void @zext() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext undef to ; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> @@ -846,6 +1662,17 @@ define void @zext() { %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i16.v2i64(<2 x i16> undef, <2 x i1> 
undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> @@ -857,6 +1684,17 @@ define void @zext() { %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> @@ -868,6 +1706,17 @@ define void @zext() { %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> @@ -879,6 +1728,17 @@ define void @zext() { %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) + 
%vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> @@ -890,6 +1750,17 @@ define void @zext() { %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> @@ -901,6 +1772,17 @@ define void @zext() { %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 
undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> @@ -912,6 +1794,17 @@ define void @zext() { %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> @@ -923,6 +1816,17 @@ define void @zext() { %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> + %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef) + %nxv1i8_nxv1i16 = zext undef to %nxv1i8_nxv1i32 = zext undef to %nxv1i8_nxv1i64 = zext undef to @@ -934,6 +1838,17 @@ define void @zext() { %nxv1i1_nxv1i32 = zext undef to %nxv1i1_nxv1i64 = zext undef to + %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i8.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i8.nxv1i32( undef, 
undef, i32 undef) + %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i8.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i16.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i16.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i1.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i1.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i1.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i1.nxv1i64( undef, undef, i32 undef) + %nxv2i8_nxv2i16 = zext undef to %nxv2i8_nxv2i32 = zext undef to %nxv2i8_nxv2i64 = zext undef to @@ -945,6 +1860,17 @@ define void @zext() { %nxv2i1_nxv2i32 = zext undef to %nxv2i1_nxv2i64 = zext undef to + %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i8.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i8.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i8.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i16.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i16.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i1.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i1.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i1.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i1.nxv2i64( undef, undef, i32 undef) + %nxv4i8_nxv4i16 = zext undef to %nxv4i8_nxv4i32 = zext undef to %nxv4i8_nxv4i64 = zext undef to @@ -956,6 +1882,17 @@ define void @zext() { %nxv4i1_nxv4i32 = zext undef to %nxv4i1_nxv4i64 = zext undef to + %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i8.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i8.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i8.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i16.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i16.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i1.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i1.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i1.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i1.nxv4i64( undef, undef, i32 undef) + %nxv8i8_nxv8i16 = zext undef to %nxv8i8_nxv8i32 = zext undef to %nxv8i8_nxv8i64 = zext undef to @@ -967,6 +1904,17 @@ define void @zext() { %nxv8i1_nxv8i32 = zext undef to %nxv8i1_nxv8i64 = zext undef to + %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i8.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i8.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i8.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i16.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i16.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i1.nxv8i8( undef, undef, i32 
undef) + %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i1.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i1.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i1.nxv8i64( undef, undef, i32 undef) + %nxv16i8_nxv16i16 = zext undef to %nxv16i8_nxv16i32 = zext undef to %nxv16i8_nxv16i64 = zext undef to @@ -978,6 +1926,17 @@ define void @zext() { %nxv16i1_nxv16i32 = zext undef to %nxv16i1_nxv16i64 = zext undef to + %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i8.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i8.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i8.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i16.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i16.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i1.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i1.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i1.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i1.nxv16i64( undef, undef, i32 undef) + %nxv32i8_nxv32i16 = zext undef to %nxv32i8_nxv32i32 = zext undef to %nxv32i8_nxv32i64 = zext undef to @@ -989,6 +1948,17 @@ define void @zext() { %nxv32i1_nxv32i32 = zext undef to %nxv32i1_nxv32i64 = zext undef to + %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i8.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i8.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i8.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i16.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i16.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i1.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i1.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i1.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i1.nxv32i64( undef, undef, i32 undef) + %nxv64i8_nxv64i16 = zext undef to %nxv64i8_nxv64i32 = zext undef to %nxv64i8_nxv64i64 = zext undef to @@ -1000,6 +1970,17 @@ define void @zext() { %nxv64i1_nxv64i32 = zext undef to %nxv64i1_nxv64i64 = zext undef to + %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i8.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i8.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i8.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i16.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i16.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i1.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i1.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i1.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i1.nxv64i64( undef, undef, i32 undef) + %nxv128i8_nxv128i16 = 
zext undef to %nxv128i8_nxv128i32 = zext undef to %nxv128i8_nxv128i128 = zext undef to @@ -1011,6 +1992,17 @@ define void @zext() { %nxv128i1_nxv128i32 = zext undef to %nxv128i1_nxv128i128 = zext undef to + %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i8.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i8.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i8.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i16.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i16.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i32.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i1.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i1.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i1.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i1.nxv128i128( undef, undef, i32 undef) + ret void } @@ -1021,6 +2013,11 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1031,6 +2028,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1041,6 +2048,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1051,6 +2068,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1061,6 +2088,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> 
+; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1071,6 +2108,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1081,6 +2128,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> 
undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1091,6 +2148,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1101,6 +2168,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x 
i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to @@ -1111,6 +2188,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to @@ -1121,6 +2208,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to @@ -1131,6 +2228,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call 
@llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to @@ -1141,6 +2248,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to @@ -1151,6 +2268,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to @@ -1161,6 +2288,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i8 = trunc undef to @@ -1171,6 +2308,16 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to ; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i1 = trunc undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'trunc' @@ -1179,6 +2326,11 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> ; 
RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1189,6 +2341,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = 
trunc <4 x i16> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1199,6 +2361,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1209,6 +2381,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1219,6 +2401,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> 
undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1229,6 +2421,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1239,6 +2441,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1249,6 +2461,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> 
undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1259,6 +2481,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to @@ -1269,6 +2501,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to @@ -1279,6 +2521,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to @@ -1289,6 +2541,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to @@ -1299,6 +2561,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to @@ -1309,6 +2581,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to @@ -1319,6 +2601,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc undef to @@ -1329,6 +2621,16 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to ; RV64-NEXT: 
Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1338,6 +2640,12 @@ define void @trunc() { %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> + %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i16.v2i2(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i16.v2i4(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i16.v2i6(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i4.v2i2(<2 x i4> undef, <2 x i1> undef, i32 undef) + %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i6.v2i4(<2 x i6> undef, <2 x i1> undef, i32 undef) + %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1349,6 +2657,17 @@ define void @trunc() { %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> + %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i16.v2i8(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i32.v2i8(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i64.v2i8(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i32.v2i16(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i64.v2i16(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i64.v2i32(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i8.v2i1(<2 x i8> undef, <2 x i1> undef, i32 undef) + 
%vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i16.v2i1(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i32.v2i1(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i64.v2i1(<2 x i64> undef, <2 x i1> undef, i32 undef) + %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> @@ -1360,6 +2679,17 @@ define void @trunc() { %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> + %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i16.v4i8(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i32.v4i8(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i64.v4i8(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i32.v4i16(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i64.v4i16(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i64.v4i32(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i8.v4i1(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i16.v4i1(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i32.v4i1(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i64.v4i1(<4 x i64> undef, <4 x i1> undef, i32 undef) + %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> @@ -1371,6 +2701,17 @@ define void @trunc() { %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> + %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i16.v8i8(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i32.v8i8(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i64.v8i8(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i32.v8i16(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i64.v8i16(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i64.v8i32(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i8.v8i1(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i16.v8i1(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i32.v8i1(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i64.v8i1(<8 x i64> undef, <8 x i1> undef, i32 undef) + %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> @@ -1382,6 +2723,17 @@ define void @trunc() { %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> + %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i16.v16i8(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i32.v16i8(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i64.v16i8(<16 x i64> undef, 
<16 x i1> undef, i32 undef) + %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i32.v16i16(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i64.v16i16(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i64.v16i32(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i8.v16i1(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i16.v16i1(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i32.v16i1(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i64.v16i1(<16 x i64> undef, <16 x i1> undef, i32 undef) + %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> @@ -1393,6 +2745,17 @@ define void @trunc() { %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> + %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i16.v32i8(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i32.v32i8(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i64.v32i8(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i32.v32i16(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i64.v32i16(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i64.v32i32(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i8.v32i1(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i16.v32i1(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i32.v32i1(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i64.v32i1(<32 x i64> undef, <32 x i1> undef, i32 undef) + %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> @@ -1404,6 +2767,17 @@ define void @trunc() { %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> + %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i16.v64i8(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i32.v64i8(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i64.v64i8(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i32.v64i16(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i64.v64i16(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i64.v64i32(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i8.v64i1(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i16.v64i1(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i32.v64i1(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i1 = call <64 x 
i1> @llvm.vp.trunc.v64i64.v64i1(<64 x i64> undef, <64 x i1> undef, i32 undef) + %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> @@ -1415,6 +2789,17 @@ define void @trunc() { %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> + %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i16.v128i8(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i32.v128i8(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i64.v128i8(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i32.v128i16(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i64.v128i16(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i64.v128i32(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i8.v128i1(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i16.v128i1(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i32.v128i1(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i64.v128i1(<128 x i64> undef, <128 x i1> undef, i32 undef) + %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> @@ -1426,6 +2811,17 @@ define void @trunc() { %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> + %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i16.v256i8(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i32.v256i8(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i64.v256i8(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i32.v256i16(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i64.v256i16(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i64.v256i32(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i8.v256i1(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i16.v256i1(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i32.v256i1(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i64.v256i1(<256 x i64> undef, <256 x i1> undef, i32 undef) + %nxv1i16_nxv1i8 = trunc undef to %nxv1i32_nxv1i8 = trunc undef to %nxv1i64_nxv1i8 = trunc undef to @@ -1437,6 +2833,17 @@ define void @trunc() { %nxv1i32_nxv1i1 = trunc undef to %nxv1i64_nxv1i1 = trunc undef to + %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i16.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i64.nxv1i8( undef, undef, i32 undef) + 
%vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i8.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i16.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i64.nxv1i1( undef, undef, i32 undef) + %nxv2i16_nxv2i8 = trunc undef to %nxv2i32_nxv2i8 = trunc undef to %nxv2i64_nxv2i8 = trunc undef to @@ -1448,6 +2855,17 @@ define void @trunc() { %nxv2i32_nxv2i1 = trunc undef to %nxv2i64_nxv2i1 = trunc undef to + %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i16.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i32.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i8.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i16.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i64.nxv2i1( undef, undef, i32 undef) + %nxv4i16_nxv4i8 = trunc undef to %nxv4i32_nxv4i8 = trunc undef to %nxv4i64_nxv4i8 = trunc undef to @@ -1459,6 +2877,17 @@ define void @trunc() { %nxv4i32_nxv4i1 = trunc undef to %nxv4i64_nxv4i1 = trunc undef to + %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i16.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i32.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i8.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i16.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i64.nxv4i1( undef, undef, i32 undef) + %nxv8i16_nxv8i8 = trunc undef to %nxv8i32_nxv8i8 = trunc undef to %nxv8i64_nxv8i8 = trunc undef to @@ -1470,6 +2899,17 @@ define void @trunc() { %nxv8i32_nxv8i1 = trunc undef to %nxv8i64_nxv8i1 = trunc undef to + %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i16.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i32.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i64.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i32.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i64.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i64.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i8.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i1 = call 
@llvm.vp.trunc.nxv8i16.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i32.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i64.nxv8i1( undef, undef, i32 undef) + %nxv16i16_nxv16i8 = trunc undef to %nxv16i32_nxv16i8 = trunc undef to %nxv16i64_nxv16i8 = trunc undef to @@ -1481,6 +2921,17 @@ define void @trunc() { %nxv16i32_nxv16i1 = trunc undef to %nxv16i64_nxv16i1 = trunc undef to + %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i16.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i8.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i16.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i64.nxv16i1( undef, undef, i32 undef) + %nxv32i16_nxv32i8 = trunc undef to %nxv32i32_nxv32i8 = trunc undef to %nxv32i64_nxv32i8 = trunc undef to @@ -1492,6 +2943,17 @@ define void @trunc() { %nxv32i32_nxv32i1 = trunc undef to %nxv32i64_nxv32i1 = trunc undef to + %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i16.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i8.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i16.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i64.nxv32i1( undef, undef, i32 undef) + %nxv64i16_nxv64i8 = trunc undef to %nxv64i32_nxv64i8 = trunc undef to %nxv64i64_nxv64i8 = trunc undef to @@ -1503,6 +2965,17 @@ define void @trunc() { %nxv64i32_nxv64i1 = trunc undef to %nxv64i64_nxv64i1 = trunc undef to + %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i16.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i8.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i16.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i64.nxv64i1( undef, undef, i32 undef) + ret 
void } @@ -1511,103 +2984,201 @@ define void @fpext() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2f32 = fpext <2 x half> undef to <2 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f64 = fpext <2 x half> undef to <2 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f64 = fpext <2 x float> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4f32 = fpext <4 x half> undef to <4 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4f64 = fpext <4 x half> undef to <4 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4f64 = fpext <4 x float> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8f32 = fpext <8 x half> undef to <8 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16_v8f64 = fpext <8 x half> undef to <8 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8f64 = fpext <8 x float> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16_v16f32 = fpext <16 x half> undef to <16 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16_v16f64 = fpext <16 x half> undef to <16 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16f64 = fpext <16 x float> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> 
undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f16_v32f32 = fpext <32 x half> undef to <32 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f16_v32f64 = fpext <32 x half> undef to <32 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32f64 = fpext <32 x float> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64f16_v64f32 = fpext <64 x half> undef to <64 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f16_v64f64 = fpext <64 x half> undef to <64 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64f64 = fpext <64 x float> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128f16_v128f32 = fpext <128 x half> undef to <128 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128f16_v128f64 = fpext <128 x half> undef to <128 x double> ; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128f64 = fpext <128 x float> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f16_nxv1f32 = call 
@llvm.vp.fpext.nxv1f32.nxv1f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2f32.nxv2f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f16_nxv4f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4f32.nxv4f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f16_nxv8f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f16_nxv8f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f16_nxv8f32 = call @llvm.vp.fpext.nxv8f32.nxv8f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f16_nxv8f64 = call @llvm.vp.fpext.nxv8f64.nxv8f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8f64 = call @llvm.vp.fpext.nxv8f64.nxv8f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f16_nxv16f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f16_nxv16f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f16_nxv16f32 = call @llvm.vp.fpext.nxv16f32.nxv16f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f16_nxv16f64 = call @llvm.vp.fpext.nxv16f64.nxv16f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16f64 = call @llvm.vp.fpext.nxv16f64.nxv16f32( undef, undef, i32 undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32f16_nxv32f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f16_nxv32f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32f16_nxv32f32 = call @llvm.vp.fpext.nxv32f32.nxv32f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f16_nxv32f64 = call @llvm.vp.fpext.nxv32f64.nxv32f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32f64 = call @llvm.vp.fpext.nxv32f64.nxv32f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64f32 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64f64 = fpext undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64f16_nxv64f32 = call @llvm.vp.fpext.nxv64f32.nxv64f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64f16_nxv64f64 = call @llvm.vp.fpext.nxv64f64.nxv64f16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64f64 = call @llvm.vp.fpext.nxv64f64.nxv64f32( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f16_v2f32 = fpext <2 x half> undef to <2 x float> %v2f16_v2f64 = fpext <2 x half> undef to <2 x double> %v2f32_v2f64 = fpext <2 x float> undef to <2 x double> + %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2half.v2float(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2half.v2double(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2float.v2double(<2 x float> undef, <2 x i1> undef, i32 undef) + %v4f16_v4f32 = fpext <4 x half> undef to <4 x float> %v4f16_v4f64 = fpext <4 x half> undef to <4 x double> %v4f32_v4f64 = fpext <4 x float> undef to <4 x double> + %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4half.v4float(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4half.v4double(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4float.v4double(<4 x float> undef, <4 x i1> undef, i32 undef) + %v8f16_v8f32 = fpext <8 x half> undef to <8 x float> %v8f16_v8f64 = fpext <8 x half> undef to <8 x double> %v8f32_v8f64 = fpext <8 x float> undef to <8 x double> + %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8half.v8float(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8half.v8double(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8float.v8double(<8 x float> undef, <8 x i1> undef, i32 undef) + %v16f16_v16f32 = fpext <16 x half> undef to <16 x float> %v16f16_v16f64 = fpext <16 x half> undef to <16 x double> %v16f32_v16f64 = fpext <16 x float> undef to <16 x double> + %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16half.v16float(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f16_v16f64 = call <16 x 
double> @llvm.vp.fpext.v16half.v16double(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16float.v16double(<16 x float> undef, <16 x i1> undef, i32 undef) + %v32f16_v32f32 = fpext <32 x half> undef to <32 x float> %v32f16_v32f64 = fpext <32 x half> undef to <32 x double> %v32f32_v32f64 = fpext <32 x float> undef to <32 x double> + %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32half.v32float(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32half.v32double(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32float.v32double(<32 x float> undef, <32 x i1> undef, i32 undef) + %v64f16_v64f32 = fpext <64 x half> undef to <64 x float> %v64f16_v64f64 = fpext <64 x half> undef to <64 x double> %v64f32_v64f64 = fpext <64 x float> undef to <64 x double> + %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64half.v64float(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64half.v64double(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64float.v64double(<64 x float> undef, <64 x i1> undef, i32 undef) + %v128f16_v128f32 = fpext <128 x half> undef to <128 x float> %v128f16_v128f64 = fpext <128 x half> undef to <128 x double> %v128f32_v128f64 = fpext <128 x float> undef to <128 x double> + %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128half.v128float(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128half.v128double(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128float.v128double(<128 x float> undef, <128 x i1> undef, i32 undef) + %nxv1f16_nxv1f32 = fpext undef to %nxv1f16_nxv1f64 = fpext undef to %nxv1f32_nxv1f64 = fpext undef to + %vp_nxv1f16_nxv1f32 = call @llvm.vp.fpext.nxv1half.nxv1float( undef, undef, i32 undef) + %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1half.nxv1double( undef, undef, i32 undef) + %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1float.nxv1double( undef, undef, i32 undef) + %nxv2f16_nxv2f32 = fpext undef to %nxv2f16_nxv2f64 = fpext undef to %nxv2f32_nxv2f64 = fpext undef to + %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2half.nxv2float( undef, undef, i32 undef) + %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2half.nxv2double( undef, undef, i32 undef) + %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2float.nxv2double( undef, undef, i32 undef) + %nxv4f16_nxv4f32 = fpext undef to %nxv4f16_nxv4f64 = fpext undef to %nxv4f32_nxv4f64 = fpext undef to + %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4half.nxv4float( undef, undef, i32 undef) + %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4half.nxv4double( undef, undef, i32 undef) + %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4float.nxv4double( undef, undef, i32 undef) + %nxv8f16_nxv8f32 = fpext undef to %nxv8f16_nxv8f64 = fpext undef to %nxv8f32_nxv8f64 = fpext undef to + %vp_nxv8f16_nxv8f32 = call @llvm.vp.fpext.nxv8half.nxv8float( undef, undef, i32 undef) + %vp_nxv8f16_nxv8f64 = call @llvm.vp.fpext.nxv8half.nxv8double( undef, undef, i32 undef) + %vp_nxv8f32_nxv8f64 = call @llvm.vp.fpext.nxv8float.nxv8double( undef, undef, i32 undef) + %nxv16f16_nxv16f32 = fpext undef to %nxv16f16_nxv16f64 = fpext undef to %nxv16f32_nxv16f64 = fpext undef to + %vp_nxv16f16_nxv16f32 = call @llvm.vp.fpext.nxv16half.nxv16float( undef, undef, i32 
undef) + %vp_nxv16f16_nxv16f64 = call @llvm.vp.fpext.nxv16half.nxv16double( undef, undef, i32 undef) + %vp_nxv16f32_nxv16f64 = call @llvm.vp.fpext.nxv16float.nxv16double( undef, undef, i32 undef) + %nxv32f16_nxv32f32 = fpext undef to %nxv32f16_nxv32f64 = fpext undef to %nxv32f32_nxv32f64 = fpext undef to + %vp_nxv32f16_nxv32f32 = call @llvm.vp.fpext.nxv32half.nxv32float( undef, undef, i32 undef) + %vp_nxv32f16_nxv32f64 = call @llvm.vp.fpext.nxv32half.nxv32double( undef, undef, i32 undef) + %vp_nxv32f32_nxv32f64 = call @llvm.vp.fpext.nxv32float.nxv32double( undef, undef, i32 undef) + %nxv64f16_nxv64f32 = fpext undef to %nxv64f16_nxv64f64 = fpext undef to %nxv64f32_nxv64f64 = fpext undef to + %vp_nxv64f16_nxv64f32 = call @llvm.vp.fpext.nxv64half.nxv64float( undef, undef, i32 undef) + %vp_nxv64f16_nxv64f64 = call @llvm.vp.fpext.nxv64half.nxv64double( undef, undef, i32 undef) + %vp_nxv64f32_nxv64f64 = call @llvm.vp.fpext.nxv64float.nxv64double( undef, undef, i32 undef) + ret void } @@ -1616,103 +3187,201 @@ define void @fptrunc() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half> ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: 
%v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1f32.nxv1f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv1f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv1f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2f32.nxv2f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4f32.nxv4f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f32( undef, undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8f32.nxv8f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16f32.nxv16f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32f32.nxv32f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64f16 = fptrunc undef to ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64f32.nxv64f64( undef, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half> %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half> %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float> + %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2float.v2half(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2double.v2half(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2double.v2float(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half> %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half> %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float> + 
%vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4float.v4half(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4double.v4half(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4double.v4float(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half> %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half> %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float> + %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8float.v8half(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8double.v8half(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8double.v8float(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half> %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half> %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float> + %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16float.v16half(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16double.v16half(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16double.v16float(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half> %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half> %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float> + %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32float.v32half(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32double.v32half(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32double.v32float(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half> %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half> %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float> + %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64float.v64half(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64double.v64half(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64double.v64float(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half> %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half> %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float> + %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128float.v128half(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128double.v128half(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128double.v128float(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f32 = fptrunc undef to + %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1float.nxv1half( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1double.nxv1half( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1double.nxv1float( undef, undef, i32 undef) + %nxv2f32_nxv1f16 = fptrunc undef to %nxv2f64_nxv1f16 = fptrunc 
undef to %nxv2f64_nxv1f32 = fptrunc undef to + %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2float.nxv2half( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2double.nxv2half( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2double.nxv2float( undef, undef, i32 undef) + %nxv4f32_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f32 = fptrunc undef to + %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4float.nxv4half( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4double.nxv4half( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4double.nxv4float( undef, undef, i32 undef) + %nxv8f32_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f32 = fptrunc undef to + %vp_nxv8f32_nxv8f16 = call @llvm.vp.fptrunc.nxv8float.nxv8half( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8double.nxv8half( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8double.nxv8float( undef, undef, i32 undef) + %nxv16f32_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f32 = fptrunc undef to + %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16float.nxv16half( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16double.nxv16half( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16double.nxv16float( undef, undef, i32 undef) + %nxv32f32_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f32 = fptrunc undef to + %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32float.nxv32half( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32double.nxv32half( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32double.nxv32float( undef, undef, i32 undef) + %nxv64f32_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f32 = fptrunc undef to + %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64float.nxv64half( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64double.nxv64half( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64double.nxv64float( undef, undef, i32 undef) + ret void } @@ -1728,6 +3397,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> 
@llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -1738,6 +3417,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -1748,6 +3437,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -1758,6 +3457,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -1768,6 +3477,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x 
double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -1778,6 +3497,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = 
fptosi <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -1788,6 +3517,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to @@ -1798,6 +3537,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to @@ -1808,6 +3557,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = 
call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to @@ -1818,6 +3577,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to @@ -1828,6 +3597,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call 
@llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to @@ -1838,6 +3617,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an 
estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to @@ -1848,6 +3637,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to @@ -1858,6 +3657,16 @@ define void @fptosi() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptosi' @@ -1871,6 +3680,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -1881,6 +3700,16 @@ define void @fptosi() { ; RV64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -1891,6 +3720,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x 
i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -1901,6 +3740,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> 
@llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -1911,6 +3760,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -1921,6 +3780,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for 
instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -1931,6 +3800,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: 
%vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to @@ -1941,6 +3820,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to @@ -1951,6 +3840,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to @@ -1961,6 +3860,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to @@ -1971,6 +3880,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to @@ -1981,6 +3900,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to @@ -1991,6 +3920,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 
undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to @@ -2001,6 +3940,16 @@ define void @fptosi() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> @@ -2014,6 +3963,17 @@ define void @fptosi() { %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> + %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2float.v2i16(<2 x float> undef, <2 x i1> 
undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> @@ -2025,6 +3985,17 @@ define void @fptosi() { %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> @@ -2036,6 +4007,17 @@ define void @fptosi() { %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> 
@llvm.vp.fptosi.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> @@ -2047,6 +4029,17 @@ define void @fptosi() { %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> @@ -2058,6 +4051,17 @@ define void @fptosi() { %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> @@ -2069,6 +4073,17 @@ define void @fptosi() { %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = 
call <64 x i8> @llvm.vp.fptosi.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> @@ -2080,6 +4095,17 @@ define void @fptosi() { %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1i8 = fptosi undef to %nxv1f64_nxv1i8 = fptosi undef to %nxv1f32_nxv1i16 = fptosi undef to @@ -2091,6 +4117,17 @@ define void @fptosi() { %nxv1f32_nxv1i1 = fptosi undef to %nxv1f64_nxv1i1 = fptosi undef to + %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1float.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1double.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1float.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1double.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1float.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1double.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1float.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1double.nxv1i64( undef, 
undef, i32 undef) + %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1float.nxv1i1( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1double.nxv1i1( undef, undef, i32 undef) + %nxv2f32_nxv2i8 = fptosi undef to %nxv2f64_nxv2i8 = fptosi undef to %nxv2f32_nxv2i16 = fptosi undef to @@ -2102,6 +4139,17 @@ define void @fptosi() { %nxv2f32_nxv2i1 = fptosi undef to %nxv2f64_nxv2i1 = fptosi undef to + %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2float.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2double.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2float.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2double.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2float.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2double.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2float.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2double.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2float.nxv2i1( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2double.nxv2i1( undef, undef, i32 undef) + %nxv4f32_nxv4i8 = fptosi undef to %nxv4f64_nxv4i8 = fptosi undef to %nxv4f32_nxv4i16 = fptosi undef to @@ -2113,6 +4161,17 @@ define void @fptosi() { %nxv4f32_nxv4i1 = fptosi undef to %nxv4f64_nxv4i1 = fptosi undef to + %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4float.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4double.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4float.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4double.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4float.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4double.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4float.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4double.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4float.nxv4i1( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4double.nxv4i1( undef, undef, i32 undef) + %nxv8f32_nxv8i8 = fptosi undef to %nxv8f64_nxv8i8 = fptosi undef to %nxv8f32_nxv8i16 = fptosi undef to @@ -2124,6 +4183,17 @@ define void @fptosi() { %nxv8f32_nxv8i1 = fptosi undef to %nxv8f64_nxv8i1 = fptosi undef to + %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8float.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8double.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8float.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8double.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8float.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8double.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8float.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8double.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8float.nxv8i1( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8double.nxv8i1( undef, undef, i32 undef) + %nxv16f32_nxv16i8 = fptosi undef to 
%nxv16f64_nxv16i8 = fptosi undef to %nxv16f32_nxv16i16 = fptosi undef to @@ -2135,6 +4205,17 @@ define void @fptosi() { %nxv16f32_nxv16i1 = fptosi undef to %nxv16f64_nxv16i1 = fptosi undef to + %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16float.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16double.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16float.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16double.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16float.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16double.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16float.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16double.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16float.nxv16i1( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16double.nxv16i1( undef, undef, i32 undef) + %nxv32f32_nxv32i8 = fptosi undef to %nxv32f64_nxv32i8 = fptosi undef to %nxv32f32_nxv32i16 = fptosi undef to @@ -2146,6 +4227,17 @@ define void @fptosi() { %nxv32f32_nxv32i1 = fptosi undef to %nxv32f64_nxv32i1 = fptosi undef to + %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32float.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32double.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32float.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32double.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32float.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32double.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32float.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32double.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32float.nxv32i1( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32double.nxv32i1( undef, undef, i32 undef) + %nxv64f32_nxv64i8 = fptosi undef to %nxv64f64_nxv64i8 = fptosi undef to %nxv64f32_nxv64i16 = fptosi undef to @@ -2157,6 +4249,17 @@ define void @fptosi() { %nxv64f32_nxv64i1 = fptosi undef to %nxv64f64_nxv64i1 = fptosi undef to + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64float.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64double.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64float.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64double.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64float.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64double.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64float.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64double.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64float.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64double.nxv64i1( undef, undef, i32 undef) + ret void } @@ -2172,6 +4275,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2182,6 +4295,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x 
double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> @@ -2192,6 +4315,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2202,6 +4335,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2212,6 +4355,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: 
%v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2222,6 +4375,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2232,6 +4395,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> 
@llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to @@ -2242,6 +4415,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to @@ -2252,6 +4435,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to @@ -2262,6 +4455,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to ; 
RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to @@ -2272,6 +4475,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to @@ -2282,6 +4495,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to @@ -2292,6 +4515,16 @@ define void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to @@ -2302,6 +4535,16 @@ define 
void @fptoui() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptoui' @@ -2315,6 +4558,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2325,6 +4578,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> @@ -2335,6 +4598,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2345,6 +4618,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2355,6 +4638,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2365,6 +4658,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: 
%v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2375,6 +4678,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to @@ -2385,6 +4698,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call 
@llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to @@ -2395,6 +4718,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %nxv4f64_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to @@ -2405,6 +4738,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to @@ -2415,6 +4758,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to @@ -2425,6 +4778,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to @@ -2435,6 +4798,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui 
undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to @@ -2445,6 +4818,16 @@ define void @fptoui() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 
for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> @@ -2458,6 +4841,17 @@ define void @fptoui() { %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> + %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef) + %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> @@ -2469,6 +4863,17 @@ define void @fptoui() { %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> 
@@ -2480,6 +4885,17 @@ define void @fptoui() { %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> @@ -2491,6 +4907,17 @@ define void @fptoui() { %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> @@ -2502,6 +4929,17 @@ define void @fptoui() { %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 
undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> @@ -2513,6 +4951,17 @@ define void @fptoui() { %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> @@ -2524,6 +4973,17 @@ define void @fptoui() { %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128double.v128i64(<128 x double> undef, <128 x i1> 
undef, i32 undef) + %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %nxv1f32_nxv1i8 = fptoui undef to %nxv1f64_nxv1i8 = fptoui undef to %nxv1f32_nxv1i16 = fptoui undef to @@ -2535,6 +4995,17 @@ define void @fptoui() { %nxv1f32_nxv1i1 = fptoui undef to %nxv1f64_nxv1i1 = fptoui undef to + %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1float.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1double.nxv1i8( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1float.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1double.nxv1i16( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1float.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1double.nxv1i32( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1float.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1double.nxv1i64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1float.nxv1i1( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1double.nxv1i1( undef, undef, i32 undef) + %nxv2f32_nxv2i8 = fptoui undef to %nxv2f64_nxv2i8 = fptoui undef to %nxv2f32_nxv2i16 = fptoui undef to @@ -2546,6 +5017,17 @@ define void @fptoui() { %nxv2f32_nxv2i1 = fptoui undef to %nxv2f64_nxv2i1 = fptoui undef to + %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2float.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2double.nxv2i8( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2float.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2double.nxv2i16( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2float.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2double.nxv2i32( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2float.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2double.nxv2i64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2float.nxv2i1( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2double.nxv2i1( undef, undef, i32 undef) + %nxv4f32_nxv4i8 = fptoui undef to %nxv4f64_nxv4i8 = fptoui undef to %nxv4f32_nxv4i16 = fptoui undef to @@ -2557,6 +5039,17 @@ define void @fptoui() { %nxv4f32_nxv4i1 = fptoui undef to %nxv4f64_nxv4i1 = fptoui undef to + %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4float.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4double.nxv4i8( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4float.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4double.nxv4i16( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4float.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4double.nxv4i32( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4float.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4double.nxv4i64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4float.nxv4i1( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i1 = call 
@llvm.vp.fptoui.nxv4double.nxv4i1( undef, undef, i32 undef) + %nxv8f32_nxv8i8 = fptoui undef to %nxv8f64_nxv8i8 = fptoui undef to %nxv8f32_nxv8i16 = fptoui undef to @@ -2568,6 +5061,17 @@ define void @fptoui() { %nxv8f32_nxv8i1 = fptoui undef to %nxv8f64_nxv8i1 = fptoui undef to + %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8float.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8double.nxv8i8( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8float.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8double.nxv8i16( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8float.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8double.nxv8i32( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8float.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8double.nxv8i64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8float.nxv8i1( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8double.nxv8i1( undef, undef, i32 undef) + %nxv16f32_nxv16i8 = fptoui undef to %nxv16f64_nxv16i8 = fptoui undef to %nxv16f32_nxv16i16 = fptoui undef to @@ -2579,6 +5083,17 @@ define void @fptoui() { %nxv16f32_nxv16i1 = fptoui undef to %nxv16f64_nxv16i1 = fptoui undef to + %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16float.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16double.nxv16i8( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16float.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16double.nxv16i16( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16float.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16double.nxv16i32( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16float.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16double.nxv16i64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16float.nxv16i1( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16double.nxv16i1( undef, undef, i32 undef) + %nxv32f32_nxv32i8 = fptoui undef to %nxv32f64_nxv32i8 = fptoui undef to %nxv32f32_nxv32i16 = fptoui undef to @@ -2590,6 +5105,17 @@ define void @fptoui() { %nxv32f32_nxv32i1 = fptoui undef to %nxv32f64_nxv32i1 = fptoui undef to + %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32float.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32double.nxv32i8( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32float.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32double.nxv32i16( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32float.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32double.nxv32i32( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32float.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32double.nxv32i64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32float.nxv32i1( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32double.nxv32i1( undef, undef, i32 undef) + %nxv64f32_nxv64i8 = fptoui undef to %nxv64f64_nxv64i8 = fptoui undef to 
%nxv64f32_nxv64i16 = fptoui undef to @@ -2601,6 +5127,17 @@ define void @fptoui() { %nxv64f32_nxv64i1 = fptoui undef to %nxv64f64_nxv64i1 = fptoui undef to + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64float.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64double.nxv64i8( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64float.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64double.nxv64i16( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64float.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64double.nxv64i32( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64float.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64double.nxv64i64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64float.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64double.nxv64i1( undef, undef, i32 undef) + ret void } @@ -2616,6 +5153,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2626,6 +5173,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2636,6 +5193,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2646,6 +5213,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2656,6 +5233,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to 
<64 x float> @@ -2666,6 +5253,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2676,6 +5273,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; 
RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to @@ -2686,6 +5293,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to @@ -2696,6 +5313,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to @@ -2706,6 +5333,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to @@ -2716,6 +5353,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp 
undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to @@ -2726,6 +5373,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to @@ -2736,6 +5393,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 
undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to @@ -2746,6 +5413,16 @@ define void @sitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'sitofp' @@ -2759,6 +5436,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> 
undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2769,6 +5456,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 
undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2779,6 +5476,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2789,6 +5496,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2799,6 +5516,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for 
instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> @@ -2809,6 +5536,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2819,6 +5556,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: 
%vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to @@ -2829,6 +5576,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to @@ -2839,6 +5596,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) 
+; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to @@ -2849,6 +5616,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to @@ -2859,6 +5636,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to @@ -2869,6 +5656,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to @@ -2879,6 +5676,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 
34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to @@ -2889,6 +5696,16 @@ define void @sitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> @@ -2902,6 +5719,17 @@ define void @sitofp() { %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x 
double> %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> @@ -2913,6 +5741,17 @@ define void @sitofp() { %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> @@ -2924,6 +5763,17 @@ define void @sitofp() { %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> @@ -2935,6 +5785,17 @@ define void @sitofp() { %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 
undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> @@ -2946,6 +5807,17 @@ define void @sitofp() { %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> @@ -2957,6 +5829,17 @@ define void @sitofp() { %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x 
float> @llvm.vp.sitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> @@ -2968,6 +5851,17 @@ define void @sitofp() { %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %nxv1i8_nxv1f32 = sitofp undef to %nxv1i8_nxv1f64 = sitofp undef to %nxv1i16_nxv1f32 = sitofp undef to @@ -2979,6 +5873,17 @@ define void @sitofp() { %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1i8.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1i8.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1i16.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1i16.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1i32.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1i32.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1i64.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1i64.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1i1.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %nxv2i8_nxv2f32 = sitofp undef to %nxv2i8_nxv2f64 = sitofp undef to %nxv2i16_nxv2f32 = sitofp undef to @@ -2990,6 +5895,17 @@ define void @sitofp() { %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2i8.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2i8.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2i16.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2i16.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call 
@llvm.vp.sitofp.nxv2i32.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2i32.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2i64.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2i64.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2i1.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %nxv4i8_nxv4f32 = sitofp undef to %nxv4i8_nxv4f64 = sitofp undef to %nxv4i16_nxv4f32 = sitofp undef to @@ -3001,6 +5917,17 @@ define void @sitofp() { %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4i8.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4i8.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4i16.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4i16.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4i32.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4i32.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4i64.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4i64.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4i1.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %nxv8i8_nxv8f32 = sitofp undef to %nxv8i8_nxv8f64 = sitofp undef to %nxv8i16_nxv8f32 = sitofp undef to @@ -3012,6 +5939,17 @@ define void @sitofp() { %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv8i8.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv8i8.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv8i16.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv8i16.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv8i32.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv8i32.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv8i64.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv8i64.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv8i1.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %nxv16i8_nxv16f32 = sitofp undef to %nxv16i8_nxv16f64 = sitofp undef to %nxv16i16_nxv16f32 = sitofp undef to @@ -3023,6 +5961,17 @@ define void @sitofp() { %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16i8.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16i8.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16i16.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16i16.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16i32.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16i32.nxv16double( undef, undef, i32 undef) + 
%vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16i64.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16i64.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16i1.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %nxv32i8_nxv32f32 = sitofp undef to %nxv32i8_nxv32f64 = sitofp undef to %nxv32i16_nxv32f32 = sitofp undef to @@ -3034,6 +5983,17 @@ define void @sitofp() { %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32i8.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32i8.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32i16.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32i16.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32i32.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32i32.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32i64.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32i64.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32i1.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %nxv64i8_nxv64f32 = sitofp undef to %nxv64i8_nxv64f64 = sitofp undef to %nxv64i16_nxv64f32 = sitofp undef to @@ -3045,6 +6005,17 @@ define void @sitofp() { %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64i8.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64i8.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64i16.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64i16.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64i32.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64i32.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64i64.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64i64.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64i1.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + ret void } @@ -3060,6 +6031,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3070,6 +6051,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> 
@llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3080,6 +6071,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3090,6 +6091,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 
x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3100,6 +6111,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost 
of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3110,6 +6131,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, 
<64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3120,6 +6151,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to @@ -3130,6 +6171,16 @@ define void @uitofp() { ; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to @@ -3140,6 +6191,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to @@ -3150,6 +6211,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to @@ -3160,6 +6231,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to @@ -3170,6 +6251,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV32-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to @@ -3180,6 +6271,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to @@ -3190,6 +6291,16 @@ define void @uitofp() { ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to +; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: 
%vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'uitofp' @@ -3203,6 +6314,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3213,6 +6334,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3223,6 +6354,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> +; RV64-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3233,6 +6374,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3243,6 +6394,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> 
undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3253,6 +6414,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3263,6 +6434,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> +; 
RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to @@ -3273,6 +6454,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, 
i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to @@ -3283,6 +6474,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to @@ -3293,6 +6494,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = 
uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to @@ -3303,6 +6514,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) +; RV64-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to @@ -3313,6 +6534,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to @@ -3323,6 +6554,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: 
%vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to @@ -3333,6 +6574,16 @@ define void @uitofp() { ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to +; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; 
RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> @@ -3346,6 +6597,17 @@ define void @uitofp() { %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> @@ -3357,6 +6619,17 @@ define void @uitofp() { %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> @@ -3368,6 +6641,17 @@ define void @uitofp() { %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> 
@llvm.vp.uitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> @@ -3379,6 +6663,17 @@ define void @uitofp() { %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> @@ -3390,6 +6685,17 @@ define void @uitofp() { %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) + 
%vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> @@ -3401,6 +6707,17 @@ define void @uitofp() { %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> @@ -3412,6 +6729,17 @@ define void @uitofp() { %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to 
%nxv1i16_nxv1f32 = uitofp undef to @@ -3423,6 +6751,17 @@ define void @uitofp() { %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1i8.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1i8.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1i16.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1i16.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1i32.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1i32.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1i64.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1i64.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1i1.nxv1float( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to %nxv2i16_nxv2f32 = uitofp undef to @@ -3434,6 +6773,17 @@ define void @uitofp() { %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2i8.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2i8.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2i16.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2i16.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2i32.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2i32.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2i64.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2i64.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2i1.nxv2float( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to %nxv4i16_nxv4f32 = uitofp undef to @@ -3445,6 +6795,17 @@ define void @uitofp() { %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4i8.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4i8.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4i16.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4i16.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4i32.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4i32.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4i64.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4i64.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4i1.nxv4float( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to %nxv8i16_nxv8f32 = uitofp undef to @@ -3456,6 +6817,17 @@ define void @uitofp() { %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to + %vp_nxv8fi8_nxv8f32 = call 
@llvm.vp.uitofp.nxv8i8.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8i8.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8i16.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8i16.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8i32.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8i32.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8i64.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8i64.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8i1.nxv8float( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to %nxv16i16_nxv16f32 = uitofp undef to @@ -3467,6 +6839,17 @@ define void @uitofp() { %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16i8.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16i8.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16i16.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16i16.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16i32.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16i32.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16i64.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16i64.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16i1.nxv16float( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %nxv32i8_nxv32f32 = uitofp undef to %nxv32i8_nxv32f64 = uitofp undef to %nxv32i16_nxv32f32 = uitofp undef to @@ -3478,6 +6861,17 @@ define void @uitofp() { %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32i8.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32i8.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32i16.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32i16.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32i32.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32i32.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32i64.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32i64.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32i1.nxv32float( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %nxv64i8_nxv64f32 = uitofp undef to %nxv64i8_nxv64f64 = uitofp undef to %nxv64i16_nxv64f32 = uitofp undef to @@ -3489,6 +6883,17 @@ define void @uitofp() { %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64i8.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call 
@llvm.vp.uitofp.nxv64i8.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64i16.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64i16.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64i32.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64i32.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64i64.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64i64.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64i1.nxv64float( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + ret void } From 76be3a0024fe0027bcba9a597fee32a8b2d962ae Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 4 Sep 2024 22:12:01 -0700 Subject: [PATCH 183/425] [DirectX] Fix crash in DXILOpBuilder for vector types (#107334) This function needs to return the "undefined" sigil for unknown types so that the actual error handling triggers instead of a crash. --- llvm/lib/Target/DirectX/DXILOpBuilder.cpp | 3 +-- llvm/test/CodeGen/DirectX/sin_vector_error.ll | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/sin_vector_error.ll diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index ab3ea61d05fc4..efe019a07acaa 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -123,8 +123,7 @@ static OverloadKind getOverloadKind(Type *Ty) { case Type::StructTyID: return OverloadKind::ObjectType; default: - llvm_unreachable("invalid overload type"); - return OverloadKind::VOID; + return OverloadKind::UNDEFINED; } } diff --git a/llvm/test/CodeGen/DirectX/sin_vector_error.ll b/llvm/test/CodeGen/DirectX/sin_vector_error.ll new file mode 100644 index 0000000000000..45b8d403390b9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/sin_vector_error.ll @@ -0,0 +1,11 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0-library %s 2>&1 | FileCheck %s +; The sin intrinsic needs to be scalarized before op lowering + +; CHECK: error: +; CHECK-SAME: in function sin_vector +; CHECK-SAME: Cannot create Sin operation: Invalid overload type + +define <4 x float> @sin_vector(<4 x float> %a) { + %x = call <4 x float> @llvm.sin.v4f32(<4 x float> %a) + ret <4 x float> %x +} From 787fc81437dfc924e4a7d6106248e335e32aeeeb Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 4 Sep 2024 22:12:19 -0700 Subject: [PATCH 184/425] [DirectX] Clean up trailing whitespace. 
NFC (#107335) --- llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 5f48e2ba939f5..d2c0859f52a24 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -36,7 +36,7 @@ def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; def int_dx_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; -def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; +def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; def int_dx_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_dot2 : @@ -70,7 +70,7 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty]>; -def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], +def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem, IntrWillReturn] >; def int_dx_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty]>; From d18ca271f1add262b4ee0318a980f78a402f5e9c Mon Sep 17 00:00:00 2001 From: Harini0924 Date: Wed, 4 Sep 2024 22:34:05 -0700 Subject: [PATCH 185/425] Reapply "[llvm-lit] Add precommit test to verify current behavior of glob expansion in lit's internal shell" (#106763) (#107169) This reverts commit 5af4ba2684b9b59de3bf8135f62e05ab68cfc489. The previous patch was missing the closing parenthesis `)` in the `CHECK` statement in the `llvm/utils/lit/tests/shtest-glob.py` file: `# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}` This issue broke some build bots. This patch corrects the `CHECK` statement by adding the closing parenthesis: `# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})` --- .../lit/tests/Inputs/shtest-glob/example_file1.input | 2 ++ .../lit/tests/Inputs/shtest-glob/example_file2.input | 2 ++ .../utils/lit/tests/Inputs/shtest-glob/glob-echo.txt | 2 ++ .../lit/tests/Inputs/shtest-glob/glob-mkdir.txt | 2 ++ llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg | 8 ++++++++ llvm/utils/lit/tests/shtest-glob.py | 12 ++++++++++++ 6 files changed, 28 insertions(+) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg create mode 100644 llvm/utils/lit/tests/shtest-glob.py diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input new file mode 100644 index 0000000000000..0987c9081ca1f --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input @@ -0,0 +1,2 @@ +## This is the first example file used for testing glob pattern matching. +This is the first example file. 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
new file mode 100644
index 0000000000000..f1a843f308262
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
@@ -0,0 +1,2 @@
+## This is the second example file used for testing glob pattern matching.
+This is the second example file.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
new file mode 100644
index 0000000000000..b69f5e74fd728
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern expansion by listing matching files.
+# RUN: echo %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
new file mode 100644
index 0000000000000..d1329f5dbfaae
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern handling in the mkdir command.
+# RUN: not mkdir %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
new file mode 100644
index 0000000000000..4e5f4cac4c465
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-glob"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py
new file mode 100644
index 0000000000000..ae90f31907d49
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-glob.py
@@ -0,0 +1,12 @@
+## Tests glob pattern handling in echo command.
+
+# RUN: not %{lit} -a -v %{inputs}/shtest-glob \
+# RUN: | FileCheck -dump-input=fail -match-full-lines %s
+#
+# END.
+
+# CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}})
+# CHECK: TypeError: string argument expected, got 'GlobItem'
+
+# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})
+# CHECK: # error: command failed with exit status: 1
From 16cda01d22c0ac1713f667d501bdca91594a4e13 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Thu, 5 Sep 2024 14:39:28 +0900
Subject: [PATCH 186/425] [AMDGPU] V_SET_INACTIVE optimizations (#98864)

Optimize V_SET_INACTIVE by allowing it to run in WWM. Hence WWM sections
are no longer broken up for inactive lane setting.
WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK. Some cases
still require exec manipulation and V_MOV, as in the previous code.
GFX9 sees a slight instruction count increase in edge cases due to its
smaller constant bus.

Additionally, avoid introducing exec manipulation and V_MOVs where a
source of V_SET_INACTIVE is also the destination. This is a common
pattern, as WWM register pre-allocation often assigns the same register.
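
For illustration, a minimal sketch of the IR pattern this change targets
(the function name @wwm_example is made up; the intrinsic calls mirror the
existing set.inactive tests updated below):

  ; Inactive lanes of %in are given the value 42, then the result is read
  ; back in whole-wave mode and stored.
  define amdgpu_kernel void @wwm_example(ptr addrspace(1) %out, i32 %in) {
    %masked = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
    %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %masked)
    store i32 %wwm, ptr addrspace(1) %out
    ret void
  }

  declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
  declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

On subtargets where the mask and sources fit the constant bus and literal
limits, the V_SET_INACTIVE produced here can be selected as a single
V_CNDMASK_B32 keyed on the saved live-lane mask, rather than the old
s_not exec / v_mov / s_not exec sequence; otherwise the V_MOV fallback is
still emitted.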
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 187 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 89 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 401 ++-- .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 30 +- .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 29 +- .../atomic_optimizations_global_pointer.ll | 536 ++--- .../atomic_optimizations_local_pointer.ll | 1913 ++++++----------- .../atomic_optimizations_pixelshader.ll | 64 +- llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 14 +- llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 18 +- .../AMDGPU/global_atomics_scan_fadd.ll | 607 ++---- .../AMDGPU/global_atomics_scan_fmax.ll | 422 ++-- .../AMDGPU/global_atomics_scan_fmin.ll | 422 ++-- .../AMDGPU/global_atomics_scan_fsub.ll | 607 ++---- .../llvm.amdgcn.set.inactive.chain.arg.ll | 389 ++-- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 382 ++-- .../AMDGPU/set-inactive-wwm-overwrite.ll | 12 +- .../AMDGPU/should-not-hoist-set-inactive.ll | 5 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 64 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 52 +- llvm/test/CodeGen/AMDGPU/wqm.mir | 23 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 704 +++--- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 504 +++-- .../MIR/AMDGPU/machine-function-info.ll | 7 +- 25 files changed, 3205 insertions(+), 4278 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a857bdba53c3e..844f62abc2671 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2098,8 +2098,22 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { } } +Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) { + assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64); + for (auto &Op : MI.implicit_operands()) { + if (Op.isDef()) + continue; + Register OpReg = Op.getReg(); + if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO || + OpReg == AMDGPU::SCC) + continue; + return OpReg; + } + return Register(); +} + bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { @@ -2273,37 +2287,147 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::V_SET_INACTIVE_B32: { - unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - // FIXME: We may possibly optimize the COPY once we find ways to make LLVM - // optimizations (mainly Register Coalescer) aware of WWM register liveness. - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); - MI.eraseFromParent(); - break; - } + case AMDGPU::V_SET_INACTIVE_B32: case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - expandPostRAPseudo(*Copy); - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten - Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), - MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); - expandPostRAPseudo(*Copy); - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64 + ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32; + Register ExecReg = RI.getExec(); + Register DstReg = MI.getOperand(0).getReg(); + MachineOperand &ActiveSrc = MI.getOperand(1); + MachineOperand &InactiveSrc = MI.getOperand(2); + + // Find implicit register defining lanes active outside WWM. + Register ExecSrcReg = findSetInactiveMask(MI); + assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region"); + // Note: default here is set to ExecReg so that functional MIR is still + // generated if implicit def is not found and assertions are disabled. + if (!ExecSrcReg) + ExecSrcReg = ExecReg; + + // Ideally in WWM this operation is lowered to V_CNDMASK; however, + // constant bus constraints and the presence of literal constants + // present an issue. + // Fallback to V_MOV base lowering in all but the common cases. + const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32; + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64; + const MCInstrDesc &Desc = get(Opcode); + + const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0); + const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0); + const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue()); + const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue()); + const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue()); + const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue()); + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + + int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + int ConstantBusUses = + 1 + // Starts at 1 for ExecSrcReg + (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) + + (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0); + int LiteralConstants = + ((ActiveSrc.isReg() || + (ActiveSrc.isImm() && isInlineConstant(ActiveImm))) + ? 0 + : 1) + + ((InactiveSrc.isReg() || + (InactiveSrc.isImm() && isInlineConstant(InactiveImm))) + ? 0 + : 1); + + bool UseVCndMask = + ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit; + if (VMov64 && UseVCndMask) { + // Decomposition must not introduce new literals. 
+ UseVCndMask &= + ActiveSrc.isReg() || + (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) || + (!isInlineConstant(ActiveImm)); + UseVCndMask &= InactiveSrc.isReg() || + (isInlineConstant(InactiveImmLo) && + isInlineConstant(InactiveImmHi)) || + (!isInlineConstant(InactiveImm)); + } + + if (UseVCndMask && VMov64) { + // Dual V_CNDMASK_B32 + MachineOperand ActiveLo = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand ActiveHi = buildExtractSubRegOrImm( + MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr); + MachineOperand InactiveLo = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr); + MachineOperand InactiveHi = buildExtractSubRegOrImm( + MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr); + if (ActiveSrc.isReg()) + ActiveHi.setIsKill(ActiveSrc.isKill()); + if (InactiveSrc.isReg()) + InactiveHi.setIsKill(InactiveSrc.isKill()); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0)) + .addImm(0) + .add(InactiveLo) + .addImm(0) + .add(ActiveLo) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1)) + .addImm(0) + .add(InactiveHi) + .addImm(0) + .add(ActiveHi) + .addReg(ExecSrcReg) + .addReg(DstReg, RegState::ImplicitDefine); + } else if (UseVCndMask) { + // Single V_CNDMASK_B32 + BuildMI(MBB, MI, DL, Desc, DstReg) + .addImm(0) + .add(InactiveSrc) + .addImm(0) + .add(ActiveSrc) + .addReg(ExecSrcReg); + } else { + // Fallback V_MOV case. + // Avoid unnecessary work if a source VGPR is also the destination. + // This can happen if WWM register allocation was efficient. + // Note: this assumes WWM execution. + bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg; + bool DstIsInactive = + InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg; + if (!DstIsInactive) { + // Set exec mask to inactive lanes, + // but only if active lanes would be overwritten. + if (DstIsActive) { + BuildMI(MBB, MI, DL, get(NotOpc), ExecReg) + .addReg(ExecSrcReg) + .setOperandDead(3); // Dead scc + } + // Copy inactive lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + if (!DstIsActive) { + // Set exec mask to active lanes + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg); + // Copy active lanes + MachineInstr *VMov = + BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg()) + .add(ActiveSrc); + if (VMov64) + expandPostRAPseudo(*VMov); + } + // Restore WWM + BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1); + } MI.eraseFromParent(); break; } @@ -5647,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg( MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const { + if (!SuperReg.getReg().isVirtual()) + return RI.getSubReg(SuperReg.getReg(), SubIdx); + MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4fd9b4366159b..71432510fdee4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,6 +1437,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // This is used if an operand is a 32 bit register but needs to be aligned // regardless. 
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; + + static Register findSetInactiveMask(const MachineInstr &MI); }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 9a51cbbb9f6b8..bc4b1936cb7e3 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -128,6 +128,7 @@ struct InstrInfo { char Needs = 0; char Disabled = 0; char OutNeeds = 0; + char MarkedStates = 0; }; struct BlockInfo { @@ -175,9 +176,10 @@ class SIWholeQuadMode : public MachineFunctionPass { SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; - SmallVector LowerToCopyInstrs; + SmallSetVector LowerToCopyInstrs; SmallVector KillInstrs; SmallVector InitExecInstrs; + SmallVector SetInactiveInstrs; void printInfo(); @@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, assert(!(Flag & StateExact) && Flag != 0); + // Capture all states requested in marking including disabled ones. + II.MarkedStates |= Flag; + // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector &Worklist) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); - SmallVector SetInactiveInstrs; SmallVector SoftWQMInstrs; bool HasImplicitDerivatives = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; @@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // The WQM intrinsic requires its output to have all the helper lanes // correct, so we need it to be in WQM. Flags = StateWQM; - LowerToCopyInstrs.push_back(&MI); + LowerToCopyInstrs.insert(&MI); } else if (Opcode == AMDGPU::SOFT_WQM) { - LowerToCopyInstrs.push_back(&MI); + LowerToCopyInstrs.insert(&MI); SoftWQMInstrs.push_back(&MI); } else if (Opcode == AMDGPU::STRICT_WWM) { // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus @@ -555,16 +559,24 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateStrictWQM; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { - III.Disabled = StateStrict; - MachineOperand &Inactive = MI.getOperand(2); - if (Inactive.isReg()) { - if (Inactive.isUndef()) { - LowerToCopyInstrs.push_back(&MI); - } else { - markOperand(MI, Inactive, StateStrictWWM, Worklist); + // Ignore these if V_SET_INACTIVE which already has exec src register. + // These are generated by an earlier pass which has seperately ensured + // WWM and provided a mask of inactive lanes. + Register ExecSrc = TII->findSetInactiveMask(MI); + if (!ExecSrc) { + // Disable strict states; StrictWQM will be added as required later. 
+ III.Disabled = StateStrict; + MachineOperand &Inactive = MI.getOperand(2); + if (Inactive.isReg()) { + if (Inactive.isUndef()) { + LowerToCopyInstrs.insert(&MI); + } else { + markOperand(MI, Inactive, StateStrictWWM, Worklist); + } } + SetInactiveInstrs.push_back(&MI); + BBI.NeedsLowering = true; } - SetInactiveInstrs.push_back(&MI); } else if (TII->isDisableWQM(MI)) { BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { @@ -1042,6 +1054,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); SmallVector SplitPoints; + Register ActiveLanesReg = 0; char State = BI.InitialState; for (MachineInstr &MI : llvm::make_early_inc_range( @@ -1058,6 +1071,21 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(MBB, MI); break; + case AMDGPU::ENTER_STRICT_WWM: + ActiveLanesReg = MI.getOperand(0).getReg(); + break; + case AMDGPU::EXIT_STRICT_WWM: + ActiveLanesReg = 0; + break; + case AMDGPU::V_SET_INACTIVE_B32: + case AMDGPU::V_SET_INACTIVE_B64: + if (ActiveLanesReg) { + MI.addOperand(*MBB.getParent(), + MachineOperand::CreateReg(ActiveLanesReg, false, true)); + } else { + assert(State == StateExact || State == StateWQM); + } + break; default: break; } @@ -1497,13 +1525,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() { } } for (MachineInstr *MI : LowerToCopyInstrs) { + LLVM_DEBUG(dbgs() << "simplify: " << *MI); + + Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { assert(MI->getNumExplicitOperands() == 3); - // the only reason we should be here is V_SET_INACTIVE has - // an undef input so it is being replaced by a simple copy. - // There should be a second undef source that we should remove. - assert(MI->getOperand(2).isUndef()); + if (MI->getOperand(2).isReg()) + RecomputeReg = MI->getOperand(2).getReg(); MI->removeOperand(2); MI->untieRegOperand(1); } else { @@ -1514,7 +1543,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() { ? (unsigned)AMDGPU::COPY : TII->getMovOpcode(TRI->getRegClassForOperandReg( *MRI, MI->getOperand(0))); + int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + while (Index >= 0) { + MI->removeOperand(Index); + Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); + } + MI->setDesc(TII->get(CopyOp)); + LLVM_DEBUG(dbgs() << " -> " << *MI); + + if (RecomputeReg) { + LIS->removeInterval(RecomputeReg); + LIS->createAndComputeVirtRegInterval(RecomputeReg); + } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } @@ -1656,6 +1697,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToMovInstrs.clear(); KillInstrs.clear(); InitExecInstrs.clear(); + SetInactiveInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget(); @@ -1712,6 +1754,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Changed = true; } + // Check if V_SET_INACTIVE was touched by a strict state mode. + // If so, promote to WWM; otherwise lower to COPY. 
+ for (MachineInstr *MI : SetInactiveInstrs) { + if (LowerToCopyInstrs.contains(MI)) + continue; + if (Instructions[MI].MarkedStates & StateStrict) { + Instructions[MI].Needs |= StateStrictWWM; + Instructions[MI].Disabled &= ~StateStrictWWM; + Blocks[MI->getParent()].Needs |= StateStrictWWM; + } else { + LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI); + LowerToCopyInstrs.insert(MI); + } + } + LLVM_DEBUG(printInfo()); Changed |= lowerLiveMaskQueries(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 8f88aaedf7e95..137366a45cbdf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,18 +4,39 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) + store i32 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -24,18 +45,42 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) + store i64 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison_64: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -45,39 +90,43 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c +; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s6, 56 ; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, 1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_cmp_lg_u32 s3, 0 -; GCN-NEXT: s_cbranch_scc0 .LBB2_2 +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: .LBB4_2: ; %Flow ; GCN-NEXT: s_xor_b32 s2, s2, 1 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB2_4 +; GCN-NEXT: s_cbranch_scc1 .LBB4_4 ; GCN-NEXT: ; %bb.3: ; %.zero ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB2_4: ; %.exit +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: .LBB4_4: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -96,19 +145,22 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 
v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -117,20 +169,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd +; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -138,19 +193,22 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -158,19 +216,22 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -179,22 +240,25 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -203,22 +267,25 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1.0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 1.0 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x 
float> %in, <2 x float> ) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -226,19 +293,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -247,22 +317,25 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x10001 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -271,22 +344,25 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -295,22 +371,25 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s6, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -319,18 +398,23 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -338,18 +422,22 @@ define amdgpu_kernel void 
@set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -357,18 +445,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -376,18 +468,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -395,24 +491,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index c92b78cd45573..e34ae52fc673a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -571,11 +571,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop @@ -591,10 +590,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; 
GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop @@ -609,11 +607,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX11-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART @@ -629,11 +626,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b ; DAGISEL-GFX10-LABEL: chain_to_chain_wwm: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 8d9ed9bb4343c..320268564f4db 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -329,10 +329,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND @@ -351,10 +351,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND @@ -371,11 +370,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre 
; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND @@ -393,11 +391,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 -; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index cc7050d08541a..5a8df7b84bf2f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1147,11 +1147,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1200,11 +1198,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1247,13 +1243,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1310,11 +1303,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1364,27 +1354,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1438,35 +1425,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1500,27 +1485,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: add_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1578,33 +1560,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: add_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; 
GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -2918,15 +2897,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3018,15 +2991,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3115,47 +3082,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 
v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -3228,40 +3189,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: 
v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3323,21 +3278,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -3441,22 +3390,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: 
s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3537,56 +3481,50 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -3658,35 +3596,29 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: 
v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4894,11 +4826,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, 
exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4947,11 +4877,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4994,13 +4922,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5057,11 +4982,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5111,27 +5033,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5185,35 +5104,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5247,27 +5164,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: sub_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; 
GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5325,33 +5239,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: sub_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 @@ -6707,15 +6618,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6807,15 +6712,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6904,47 +6803,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: 
v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -7017,40 +6910,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -7112,21 +6999,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -7230,22 +7111,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7326,56 +7202,50 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 @@ -7447,35 +7317,29 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_mov_b32_dpp 
v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6d0e0cc7869b3..6bf03a202c143 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -780,14 +780,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -827,14 +825,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 
1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -870,13 +866,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -925,13 +918,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -971,33 +961,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -1037,27 +1024,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -1315,11 +1299,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1350,11 +1332,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1381,11 +1361,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: add_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1414,11 +1391,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: add_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1442,34 +1416,32 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-LABEL: add_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -1482,26 +1454,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-LABEL: add_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -2398,15 +2368,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2493,15 +2457,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2585,47 +2543,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -2689,43 +2641,37 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -2734,10 +2680,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -2779,21 +2725,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -2891,23 +2831,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3235,15 +3170,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3312,15 +3241,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3386,23 +3309,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3451,22 +3368,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,21 +3420,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164_DPP-NEXT: 
v_and_b32_e32 v6, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 @@ -3575,22 +3482,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4400,14 +4302,12 @@ define amdgpu_kernel void 
@sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4447,14 +4347,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4490,13 +4388,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4545,13 +4440,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4591,33 +4483,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -4657,27 +4546,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -4935,11 +4821,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4970,11 +4854,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5001,11 +4883,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1064_DPP-LABEL: sub_i32_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5034,11 +4913,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1032_DPP-LABEL: sub_i32_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5062,34 +4938,32 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-LABEL: sub_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -5102,26 +4976,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-LABEL: sub_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -6044,15 +5916,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6139,15 
+6005,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6231,47 +6091,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 @@ -6335,43 +6189,37 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -6380,10 +6228,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf 
bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -6425,21 +6273,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 @@ -6537,23 +6379,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6943,13 +6780,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -6992,13 +6825,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7037,13 +6866,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7092,13 +6918,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; 
GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7138,33 +6961,30 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: and_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -7204,31 +7024,29 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -7619,16 +7437,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7683,16 +7495,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 
exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7741,19 +7547,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7822,19 +7624,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -7846,11 +7644,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -7885,47 +7683,43 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7981,43 +7775,39 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -8375,14 +8165,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8422,14 +8210,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8465,13 +8251,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8520,13 +8303,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8566,33 +8346,30 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: or_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -8632,27 +8409,24 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -9047,16 +8821,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec 
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -9111,16 +8879,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -9169,19 +8931,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9250,19 +9008,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp 
v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9274,11 +9028,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -9313,47 +9067,43 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -9409,43 +9159,39 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -9803,14 +9549,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9850,14 +9594,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9893,13 +9635,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9948,13 +9687,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9994,33 +9730,30 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: xor_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -10060,27 +9793,24 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: 
v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -10475,16 +10205,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -10539,16 +10263,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -10597,19 +10315,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10678,19 +10392,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10702,11 +10412,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 @@ -10741,47 +10451,43 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: xor_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -10837,43 +10543,39 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 @@ -11232,12 +10934,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11281,12 +10982,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11325,13 +11025,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: max_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11380,13 +11077,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: max_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -11426,33 +11120,30 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; 
GFX1164_DPP-LABEL: max_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -11492,31 +11183,29 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: max_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -12195,19 +11884,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: s_brev_b32 s1, 1 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12294,19 +11983,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: s_brev_b32 s1, 1 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: 
v_mov_b32_e32 v4, s1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -12393,20 +12082,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064_DPP-NEXT: s_brev_b32 s5, 1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12515,20 +12198,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo @@ -12610,77 +12287,70 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 
v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12744,55 +12414,48 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 
:: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -13158,12 +12821,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13207,12 +12869,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13251,13 +12912,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13306,13 +12964,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; 
GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -13352,33 +13007,30 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: min_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -13418,31 +13070,29 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: min_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 
-2 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -14124,16 +13774,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: s_mov_b32 s6, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14221,16 +13871,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14321,14 +13971,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14433,17 +14077,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14530,77 +14168,70 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14663,56 +14294,49 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15076,14 +14700,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15123,14 +14745,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15166,13 +14786,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15221,13 +14838,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15267,33 +14881,30 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umax_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -15333,27 +14944,24 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umax_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 
v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -16033,14 +15641,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -16131,14 +15733,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -16227,13 +15823,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: 
s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16337,20 +15929,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16433,77 +16019,70 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16564,56 +16143,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 @@ -16978,13 +16550,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17027,13 +16595,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17072,13 +16636,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umin_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17127,13 +16688,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf @@ -17173,33 +16731,30 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umin_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 
-1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 @@ -17239,31 +16794,29 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umin_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 @@ -17939,14 +17492,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -18037,14 +17584,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -18133,13 +17674,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18243,20 +17780,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 
v2, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 -; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18339,77 +17870,70 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf 
bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18470,56 +17994,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; 
GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 22eb8d05b5ff2..429e6c489bf6f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: s_mov_b64 exec, s[10:11] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: s_mov_b64 exec, s[10:11] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz .LBB1_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -449,13 +442,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz .LBB1_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -503,32 +493,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1164-NEXT: s_cbranch_execz .LBB1_4 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 -; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s12 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 @@ -577,31 +565,29 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1132-NEXT: s_cbranch_execz .LBB1_4 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: 
s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 0d74bd39b56fe..7aca63d34f51b 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: s_or_saveexec_b32 s4, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s4, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s4 ; GCN-NEXT: v_mov_b32_e32 v5, 0 @@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s5, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: v_mov_b32_e32 v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index 82dc6d21cfe33..310f32ce8f83b 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -6,16 +6,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 @@ -63,16 +60,13 @@ work: define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-LABEL: strict_wwm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s6, s3 ; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: s_mov_b32 s7, s4 ; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_mov_b32 s1, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 584b280cefb8a..311c609291886 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -816,12 +816,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -900,14 +898,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -981,14 +974,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1046,43 +1034,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1113,15 +1096,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2049,12 +2027,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2133,14 +2109,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2214,14 +2185,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2279,43 +2245,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2346,15 +2307,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: 
v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3342,12 +3298,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3426,14 +3380,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3507,14 +3456,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3572,43 +3516,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3639,15 +3578,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4131,12 +4065,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4215,14 +4147,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4296,14 +4223,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4361,43 +4283,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4428,15 +4345,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 
row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5449,12 +5361,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5533,14 +5443,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5614,14 +5519,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5679,43 +5579,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5759,15 +5654,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7442,14 +7332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7579,15 +7463,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7707,15 +7585,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7819,17 +7691,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -7864,11 +7731,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -7946,16 +7813,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9047,14 +8909,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9152,15 +9008,9 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9246,15 +9096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9324,17 +9168,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9366,9 +9205,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9419,16 +9259,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10497,14 +10332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10602,15 +10431,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10696,15 +10519,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; 
GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10774,17 +10591,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -10816,9 +10628,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -10869,16 +10682,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11429,14 +11237,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11534,15 +11336,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11628,15 +11424,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11706,17 +11496,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 
v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11748,9 +11533,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11801,16 +11587,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13526,14 +13307,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13663,15 +13438,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13791,15 +13560,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13903,17 +13666,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -13948,11 +13706,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14030,16 +13788,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 464ec088dc297..9dc82b17bd3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, 
v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; 
GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: 
s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 
s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 
row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; 
GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 26a0e34d18bdb..945583c88ce26 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 
0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; 
GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 
row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: 
v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: 
v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index c158a8007bcc5..3bc0f2546794d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -894,12 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -978,14 +976,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; 
GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1059,14 +1052,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -1124,43 +1112,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; 
GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -1204,15 +1187,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2239,12 +2217,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2323,14 +2299,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 
row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2404,14 +2375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -2469,43 +2435,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: 
v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -2549,15 +2510,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3584,12 +3540,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3668,14 +3622,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, 
v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3749,14 +3698,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3814,43 +3758,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; 
GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -3894,15 +3833,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4425,12 +4359,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4509,14 +4441,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: 
v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4590,14 +4517,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -4655,43 +4577,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -4735,15 +4652,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5769,12 +5681,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5853,14 +5763,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 
1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5934,14 +5839,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -5999,43 +5899,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -6079,15 +5974,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7762,14 +7652,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -7899,15 +7783,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: 
v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8027,15 +7905,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8139,17 +8011,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -8184,11 +8051,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -8266,16 +8133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9366,14 +9228,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -9471,15 +9327,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9565,15 +9415,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: 
v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9643,17 +9487,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9685,9 +9524,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9738,16 +9578,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ 
-10816,14 +10651,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -10921,15 +10750,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11015,15 +10838,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11093,17 +10910,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff 
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11135,9 +10947,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -11188,16 +11001,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11748,14 +11556,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9-DPP-NEXT: s_nop 0 @@ -11853,15 +11655,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-DPP-NEXT: 
s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11947,15 +11743,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -12025,17 +11815,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -12067,9 +11852,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -12120,16 +11906,11 @@ 
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13844,14 +13625,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-DPP-NEXT: s_nop 0 @@ -13981,15 +13756,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14109,15 +13878,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: 
s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -14221,17 +13984,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -14266,11 +14024,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: @@ -14348,16 +14106,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index b3acd4949301e..c1b58f1795aae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -12,97 +12,204 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11-LABEL: set_inactive_chain_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b32 v[8:9], v0, off +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: global_store_b32 v[8:9], v1, off ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dword v[8:9], v0, off +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: global_store_dword v[8:9], v1, off ; GFX10-NEXT: s_endpgm ; ; GFX11_W64-LABEL: set_inactive_chain_arg: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11_W64-NEXT: global_store_b32 v[8:9], v1, off ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dword v[8:9], v0, off +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_W64-NEXT: global_store_dword v[8:9], v1, off ; 
GFX10_W64-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0 - store i32 %tmp, ptr addrspace(1) %out + %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp) + store i32 %wwm, ptr addrspace(1) %out ret void } define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) { -; GFX11-LABEL: set_inactive_chain_arg_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: v_mov_b32_e32 v1, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11-NEXT: s_endpgm +; GISEL11-LABEL: set_inactive_chain_arg_64: +; GISEL11: ; %bb.0: +; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 +; GISEL11-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11-NEXT: s_endpgm ; -; GFX10-LABEL: set_inactive_chain_arg_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-NEXT: v_mov_b32_e32 v1, v13 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v10 -; GFX10-NEXT: v_mov_b32_e32 v1, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10-NEXT: s_endpgm +; DAGISEL11-LABEL: set_inactive_chain_arg_64: +; DAGISEL11: ; %bb.0: +; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11-NEXT: s_endpgm ; -; GFX11_W64-LABEL: set_inactive_chain_arg_64: -; GFX11_W64: ; %bb.0: -; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11_W64-NEXT: s_endpgm +; GISEL10-LABEL: set_inactive_chain_arg_64: +; GISEL10: ; %bb.0: +; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_mov_b32_e32 
v0, v10 +; GISEL10-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; GISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 +; GISEL10-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10-NEXT: s_endpgm ; -; GFX10_W64-LABEL: set_inactive_chain_arg_64: -; GFX10_W64: ; %bb.0: -; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v12 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v13 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: global_store_dwordx2 v[8:9], v[0:1], off -; GFX10_W64-NEXT: s_endpgm +; DAGISEL10-LABEL: set_inactive_chain_arg_64: +; DAGISEL10: ; %bb.0: +; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10-NEXT: s_endpgm +; +; GISEL11_W64-LABEL: set_inactive_chain_arg_64: +; GISEL11_W64: ; %bb.0: +; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; GISEL11_W64-NEXT: s_endpgm +; +; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL11_W64: ; %bb.0: +; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11_W64-NEXT: s_endpgm +; +; GISEL10_W64-LABEL: set_inactive_chain_arg_64: +; GISEL10_W64: ; %bb.0: +; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL10_W64-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; GISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; GISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; GISEL10_W64-NEXT: s_endpgm +; +; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64: +; DAGISEL10_W64: ; %bb.0: +; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 +; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10_W64-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0 - store i64 %tmp, ptr addrspace(1) %out + %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp) + store i64 %wwm, ptr addrspace(1) %out ret void } @@ -113,16 +220,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, v10 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v0 -; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: global_store_b32 v[8:9], v2, off ; GFX11-NEXT: s_endpgm @@ -133,11 +237,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, v10 ; GFX10-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v0 -; GFX10-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10-NEXT: s_mov_b32 exec_lo, s0 @@ -151,17 +252,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX11_W64-NEXT: s_not_b64 exec, exec -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX11_W64-NEXT: s_not_b64 exec, exec ; 
GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11_W64-NEXT: s_waitcnt_depctr 0xfff +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1 ; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off ; GFX11_W64-NEXT: s_endpgm @@ -172,11 +269,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11 -; GFX10_W64-NEXT: s_not_b64 exec, exec -; GFX10_W64-NEXT: v_mov_b32_e32 v0, v0 -; GFX10_W64-NEXT: s_not_b64 exec, exec ; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] @@ -214,11 +308,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -244,11 +337,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -283,10 +375,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ 
-321,10 +412,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -357,11 +447,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -394,11 +483,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -433,10 +521,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -471,10 +558,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; 
DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -511,11 +597,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 ; GISEL11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: v_mov_b32_e32 v12, v40 -; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL11-NEXT: s_mov_b32 exec_lo, s0 ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11-NEXT: s_endpgm @@ -541,11 +626,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11-NEXT: s_endpgm @@ -580,10 +664,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; GISEL10-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; GISEL10-NEXT: s_mov_b32 exec_lo, s0 ; GISEL10-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10-NEXT: global_store_dword v[41:42], v0, off ; GISEL10-NEXT: s_endpgm @@ -618,10 +701,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo -; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0 +; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10-NEXT: s_endpgm @@ -654,11 +736,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; 
GISEL11_W64-NEXT: s_not_b64 exec, exec -; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; GISEL11_W64-NEXT: s_endpgm @@ -691,11 +772,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0 ; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1] -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL11_W64-NEXT: s_not_b64 exec, exec -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off ; DAGISEL11_W64-NEXT: s_endpgm @@ -730,10 +810,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; GISEL10_W64-NEXT: s_not_b64 exec, exec -; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; GISEL10_W64-NEXT: s_not_b64 exec, exec +; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; GISEL10_W64-NEXT: s_endpgm @@ -768,10 +847,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40 -; DAGISEL10_W64-NEXT: s_not_b64 exec, exec +; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1] +; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off ; DAGISEL10_W64-NEXT: s_endpgm @@ -786,6 +864,7 @@ declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare amdgpu_gfx void @gfx_callee(<12 x i32>) attributes #0 = { convergent readnone willreturn nocallback nofree} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7b..6dc4a2ce0504b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,18 +5,22 @@ define amdgpu_kernel void @set_inactive(ptr 
addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -25,13 +29,15 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -40,20 +46,25 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -63,13 +74,16 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: 
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -82,12 +96,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -96,19 +113,20 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: .LBB4_2: ; %.exit ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB4_3: ; %.one -; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-NEXT: s_cbranch_execnz .LBB4_2 ; GCN-NEXT: .LBB4_4: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0 + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) br i1 %cmp, label %.zero, label %.one .zero: @@ -127,19 +145,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x40400000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x40400000 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0 + %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0) store float %tmp, ptr addrspace(1) %out ret void } @@ -148,22 +170,27 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s1, 0x4010cccc -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0 + %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0) store double %tmp, ptr addrspace(1) %out ret void } @@ -171,19 +198,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x10001 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x10001 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> ) #0 + %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0) store <2 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -191,19 +222,23 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> ) #0 + %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0) store <2 x half> %tmp, ptr addrspace(1) %out ret void } @@ -212,22 +247,27 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 + %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0) store <2 x i32> %tmp, ptr addrspace(1) %out ret void } @@ -236,22 +276,27 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 1.0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 1.0 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 + %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0) store <2 x float> %tmp, ptr addrspace(1) %out ret void } @@ -259,19 +304,23 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s7, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 + %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0) store <2 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -280,22 +329,27 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x10001 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x10001 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 + %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) store <4 x i16> %tmp, ptr addrspace(1) %out ret void } @@ -304,22 +358,27 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: 
v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 + %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) store <4 x half> %tmp, ptr addrspace(1) %out ret void } @@ -328,22 +387,27 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 + %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) store <4 x bfloat> %tmp, ptr addrspace(1) %out ret void } @@ -352,20 +416,25 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0 + %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0) store ptr %tmp, ptr addrspace(1) %out ret void } @@ -373,18 +442,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0 + %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0) store ptr addrspace(2) %tmp, ptr addrspace(1) %out ret void } @@ -392,18 +465,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0 + %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0) store ptr addrspace(3) %tmp, ptr addrspace(1) %out ret void } @@ -411,18 +488,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0 + %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) 
%tmp.0) store ptr addrspace(5) %tmp, ptr addrspace(1) %out ret void } @@ -430,24 +511,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm - %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0 + %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0) store ptr addrspace(6) %tmp, ptr addrspace(1) %out ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) attributes #0 = { convergent readnone } +attributes #1 = { convergent nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 81858bd3d29ee..f60786c1bacbf 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -82,12 +79,7 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN-NEXT: .LBB1_5: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v2, v3 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s1, -1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 ; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 09e342fe19066..90b32e29e98f6 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ 
b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s9 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index c3a81771a2790..ff692acda3c25 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1674,13 +1674,13 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 42 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: @@ -1688,15 +1688,16 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 42 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3] +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: global_store_dword v1, v2, s[0:1] ; GFX1064-NEXT: s_endpgm - %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) + %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) store i32 %tmp, ptr addrspace(1) %out ret void } @@ -1705,31 +1706,32 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s6 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 
v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s7 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX1064-NEXT: s_endpgm - %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) + %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) store i64 %tmp, ptr addrspace(1) %out ret void } @@ -2921,6 +2923,8 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare float @llvm.amdgcn.strict.wwm.f32(float) +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare float @llvm.amdgcn.wwm.f32(float) declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 6b4c2da772cdc..ab84c0c905771 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ -1317,7 +1311,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_nop 0 ; GFX9-W64-NEXT: 
buffer_load_dword v2, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1334,7 +1328,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX10-W32-NEXT: s_clause 0x1 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) @@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 @@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 @@ 
-2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-W64-NEXT: s_not_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index ef6d0780f395f..64a7c4457395c 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -40,6 +40,9 @@ define amdgpu_vs void @no_wqm_in_vs() { ret void } + define amdgpu_ps void @preloaded_set_inactive() { + ret void + } ... --- @@ -282,10 +285,10 @@ body: | # #CHECK-NOT: ENTER_STRICT_WWM #CHECK: BUFFER_LOAD_DWORDX2 -#CHECK-NOT: ENTER_STRICT_WWM +#CHECK: ENTER_STRICT_WWM #CHECK: V_SET_INACTIVE_B32 #CHECK: V_SET_INACTIVE_B32 -#CHECK: ENTER_STRICT_WWM +#CHECK-NOT: ENTER_STRICT_WWM #CHECK: V_MAX name: test_wwm_set_inactive_propagation tracksRegLiveness: true @@ -443,3 +446,19 @@ body: | %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ... + +--- +# Preserve V_SET_INACTIVE with exec mask already specified +#CHECK-LABEL: name: preloaded_set_inactive +#CHECK: V_SET_INACTIVE_B32 +name: preloaded_set_inactive +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd776..47e1897f6b420 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -80,17 +80,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[34:35] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -177,11 +170,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -208,12 +201,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -270,34 +259,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -378,26 +358,26 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b32 s42, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s42, s41 -; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s7 +; GFX9-O0-NEXT: s_mov_b32 s44, s43 +; GFX9-O0-NEXT: s_mov_b32 s45, s42 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 -; GFX9-O0-NEXT: s_mov_b32 s44, s35 +; GFX9-O0-NEXT: s_mov_b32 s46, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s44 -; GFX9-O0-NEXT: s_mov_b32 s38, s43 -; GFX9-O0-NEXT: s_mov_b32 s39, s42 +; GFX9-O0-NEXT: s_mov_b32 s37, s46 +; GFX9-O0-NEXT: s_mov_b32 s38, s45 +; GFX9-O0-NEXT: s_mov_b32 s39, s44 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] ; GFX9-O0-NEXT: s_add_u32 s42, s42, 
strict_wwm_called@rel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12 @@ -437,11 +417,11 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 @@ -559,7 +539,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s48, s33 +; GFX9-O0-NEXT: s_mov_b32 s50, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill @@ -583,41 +563,41 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: s_mov_b32 s35, s39 -; GFX9-O0-NEXT: s_mov_b32 s44, s38 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s35, s41 +; GFX9-O0-NEXT: s_mov_b32 s42, s40 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s45, s37 -; GFX9-O0-NEXT: s_mov_b32 s40, s36 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s41, s45 -; GFX9-O0-NEXT: s_mov_b32 s42, s44 -; GFX9-O0-NEXT: s_mov_b32 s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 +; GFX9-O0-NEXT: s_mov_b32 s43, s37 +; GFX9-O0-NEXT: s_mov_b32 s44, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 +; GFX9-O0-NEXT: s_mov_b32 s45, s43 +; GFX9-O0-NEXT: s_mov_b32 s46, s42 +; GFX9-O0-NEXT: s_mov_b32 s47, s35 +; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 
s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -634,20 +614,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -679,14 +659,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s48 +; GFX9-O0-NEXT: s_mov_b32 s33, s50 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_mov_b32 s40, s33 +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill @@ -702,28 +682,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[36:37] -; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0 -; 
GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[34:35] +; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 @@ -739,7 +717,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: s_mov_b32 s33, s40 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0) @@ -778,16 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s40, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s35 +; GFX9-O0-NEXT: s_mov_b32 s42, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -796,21 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: 
s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -851,28 +835,30 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s34, -1 -; GFX9-O3-NEXT: s_brev_b32 s35, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_mov_b32 s36, -1 +; GFX9-O3-NEXT: s_brev_b32 s37, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -922,21 +908,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -987,130 +961,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte 
Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39 +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38 +; GFX9-O0-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37 +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36 +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34 +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39 ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -1150,62 +1104,82 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34 
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1245,16 +1219,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -1265,73 +1231,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O3-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 ; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 ; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 ; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16 ; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v32, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v33, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v34, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v35, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, v5 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, v6 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v36, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v37, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4 -; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:12 -; GFX9-O3-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:8 -; GFX9-O3-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:20 -; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28 -; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24 -; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36 -; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32 -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], 
-1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32 +; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33 +; GFX9-O3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-O3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-O3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-O3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-O3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-O3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-O3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-O3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6 @@ -1359,24 +1308,21 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] - %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) - %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) - %c2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) - %d2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) - %e2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %a2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0) + %a2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %a2.i) + %b2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0) + %b2 = call i64 
@llvm.amdgcn.strict.wwm.i64(i64 %b2.i) + %c2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0) + %c2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %c2.i) + %d2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0) + %d2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %d2.i) + %e2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0) + %e2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %e2.i) store i64 %a2, ptr addrspace(5) %ptr %ptr_b = getelementptr i64, ptr addrspace(5) %ptr, i32 1 store i64 %b2, ptr addrspace(5) %ptr_b diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index a74dbe1de0d39..7f0db3e362b30 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -63,17 +63,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -154,11 +147,11 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -185,12 +178,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; 
GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -236,34 +225,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -361,35 +341,35 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: 
s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -418,13 +398,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -454,12 +434,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -613,35 +593,35 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -678,12 +658,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -721,14 +701,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; 
GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -792,16 +772,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -810,21 +792,25 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -848,28 +834,30 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 
exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 @@ -927,15 +915,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -964,17 +952,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] +; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 @@ -1055,11 +1036,11 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1086,12 +1067,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] @@ -1137,34 +1114,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] +; GFX9-O3-NEXT: s_nop 1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB8_2: ; %merge +; GFX9-O3-NEXT: ; %bb.2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1262,35 +1230,35 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 
s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1319,13 +1287,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1355,12 +1323,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 @@ -1514,35 +1482,35 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s19 +; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 +; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 +; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1579,12 +1547,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1622,14 +1590,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -1693,16 +1661,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s6, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 @@ -1711,21 +1681,25 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 @@ -1749,28 +1723,30 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s4, -1 -; GFX9-O3-NEXT: s_brev_b32 s5, -2 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s6, -1 +; GFX9-O3-NEXT: s_brev_b32 s7, -2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, 
s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b3ed7376a1ede..f73489b7db77c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -273,12 +273,15 @@ define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg %ld1 = load volatile i32, ptr addrspace(1) %ptr %inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0) %inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0) - store volatile i32 %inactive0, ptr addrspace(1) %ptr - store volatile i32 %inactive1, ptr addrspace(1) %ptr + %wwm0 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive0) + %wwm1 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive1) + store volatile i32 %wwm0, ptr addrspace(1) %ptr + store volatile i32 %wwm1, ptr addrspace(1) %ptr ret void } declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6 attributes #0 = { "no-signed-zeros-fp-math" = "true" } attributes #1 = { "amdgpu-dx10-clamp" = "false" } From 96a5aabbd6adada4525d5e0107e96e6f57dbdfbf Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 4 Sep 2024 20:22:27 -0700 Subject: [PATCH 187/425] [NFC][sanitizer] Thread safety annotation for Symbolizer --- compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h index 16ef2f2fd717b..bd89dc4e302fc 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer.h @@ -185,17 +185,17 @@ class Symbolizer final { class ModuleNameOwner { public: explicit ModuleNameOwner(Mutex *synchronized_by) - : last_match_(nullptr), mu_(synchronized_by) { + : mu_(synchronized_by), last_match_(nullptr) { storage_.reserve(kInitialCapacity); } const char *GetOwnedCopy(const char *str); private: static const uptr kInitialCapacity = 1000; - InternalMmapVector storage_; - const char *last_match_; Mutex *mu_; + const char *last_match_ SANITIZER_GUARDED_BY(mu_); + InternalMmapVector storage_ 
SANITIZER_GUARDED_BY(*mu_); } module_names_; /// Platform-specific function for creating a Symbolizer object. @@ -220,7 +220,7 @@ class Symbolizer final { // always synchronized. Mutex mu_; - IntrusiveList tools_; + IntrusiveList tools_ SANITIZER_GUARDED_BY(mu_); explicit Symbolizer(IntrusiveList tools); From aafaa6943463b56db2928081dc72b116e246c249 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 23:15:25 -0700 Subject: [PATCH 188/425] [Target] Use templated MachineFunction::getSubtarget in *CallingConv.td. NFC (#107311) This hides away the static_cast needed to get the target specific Subtarget object. --- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 10 ++++------ llvm/lib/Target/M68k/M68kCallingConv.td | 4 ++-- llvm/lib/Target/Mips/MipsCallingConv.td | 3 +-- llvm/lib/Target/PowerPC/PPCCallingConv.td | 6 ++---- llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 ++-- llvm/lib/Target/X86/X86CallingConv.td | 8 ++++---- 6 files changed, 15 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 4be64629ddac8..21412044d5a01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -214,13 +214,11 @@ def RetCC_AMDGPU_Func : CallingConv<[ ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCIf<"State.getMachineFunction().getSubtarget().getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", + CCIf<"State.getMachineFunction().getSubtarget().getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", CCDelegateTo> ]>; diff --git a/llvm/lib/Target/M68k/M68kCallingConv.td b/llvm/lib/Target/M68k/M68kCallingConv.td index 523f08e646151..cf7e5ef69463e 100644 --- a/llvm/lib/Target/M68k/M68kCallingConv.td +++ b/llvm/lib/Target/M68k/M68kCallingConv.td @@ -15,8 +15,8 @@ // TODO Verify C convention follows SysV M68K ABI class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), A>; + : CCIf().", F), A>; //===----------------------------------------------------------------------===// // Return Value Calling Conventions diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 204f11f1107cf..866161bf50638 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -11,8 +11,7 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget : CCIf" - "(State.getMachineFunction().getSubtarget()).", + "State.getMachineFunction().getSubtarget().", F), A>; diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 825c1a29ed62c..899326ad46656 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -13,13 +13,11 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. 
class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", + : CCIf().", F), A>; class CCIfNotSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", + : CCIf().", F), A>; class CCIfOrigArgWasNotPPCF128 diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index b0618aafa5da6..99bb697ce2014 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -12,8 +12,8 @@ class CCIfExtend : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; // Match if this specific argument is a fixed (i.e. named) argument. diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index c55ff3dfc9c8e..307aeb2ea4c6f 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -13,14 +13,14 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; /// CCIfNotSubtarget - Match if the current subtarget doesn't has a feature F. class CCIfNotSubtarget - : CCIf" - "(State.getMachineFunction().getSubtarget()).", F), + : CCIf().", F), A>; /// CCIfRegCallv4 - Match if RegCall ABIv4 is respected. From 0c1500ef05e0a5b25cae79d2bd361dbc6e14e726 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 4 Sep 2024 23:19:14 -0700 Subject: [PATCH 189/425] [RISCV] Fix another RV32 Zdinx load/store addressing corner case. RISCVExpandPseudoInsts makes sure the offset is divisible by 8 so we need to enforce that during isel. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 +- .../CodeGen/RISCV/zdinx-boundary-check.ll | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 7800688542160..4580f3191d138 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2691,7 +2691,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, Align Alignment = commonAlignment( GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); if ((CVal == 0 || Alignment > CVal) && - (!IsRV32Zdinx || Alignment > (CVal + 4))) { + (!IsRV32Zdinx || commonAlignment(Alignment, CVal) > 4)) { int64_t CombinedOffset = CVal + GA->getOffset(); Base = Base.getOperand(0); Offset = CurDAG->getTargetGlobalAddress( diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index 01ecaee3d7e7b..a4f56b6d28409 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -493,3 +493,67 @@ entry: store double %d, ptr %add.ptr, align 8 ret void } + +@f = global double 4.2, align 16 + +define double @foo13(ptr nocapture %p) nounwind { +; RV32ZDINX-LABEL: foo13: +; RV32ZDINX: # %bb.0: # %entry +; RV32ZDINX-NEXT: addi sp, sp, -16 +; RV32ZDINX-NEXT: lui a0, %hi(f) +; RV32ZDINX-NEXT: lw a1, %lo(f+8)(a0) +; RV32ZDINX-NEXT: sw a1, 12(sp) +; RV32ZDINX-NEXT: lw a0, %lo(f+4)(a0) +; RV32ZDINX-NEXT: sw a0, 8(sp) +; RV32ZDINX-NEXT: lw a0, 8(sp) +; RV32ZDINX-NEXT: lw a1, 12(sp) +; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: ret +; +; RV32ZDINXUALIGNED-LABEL: foo13: +; RV32ZDINXUALIGNED: # %bb.0: # %entry +; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) +; 
RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(f) +; RV32ZDINXUALIGNED-NEXT: lw a1, 8(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, 4(a0) +; RV32ZDINXUALIGNED-NEXT: ret +; +; RV64ZDINX-LABEL: foo13: +; RV64ZDINX: # %bb.0: # %entry +; RV64ZDINX-NEXT: lui a0, %hi(f) +; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0) +; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0) +; RV64ZDINX-NEXT: slli a1, a1, 32 +; RV64ZDINX-NEXT: or a0, a1, a0 +; RV64ZDINX-NEXT: ret +entry: + %add.ptr = getelementptr inbounds i8, ptr @f, i64 4 + %0 = load double, ptr %add.ptr, align 4 + ret double %0 +} + +define double @foo14(ptr nocapture %p) nounwind { +; RV32ZDINX-LABEL: foo14: +; RV32ZDINX: # %bb.0: # %entry +; RV32ZDINX-NEXT: lui a0, %hi(f) +; RV32ZDINX-NEXT: lw a1, %lo(f+12)(a0) +; RV32ZDINX-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINX-NEXT: ret +; +; RV32ZDINXUALIGNED-LABEL: foo14: +; RV32ZDINXUALIGNED: # %bb.0: # %entry +; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(f+12)(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINXUALIGNED-NEXT: ret +; +; RV64ZDINX-LABEL: foo14: +; RV64ZDINX: # %bb.0: # %entry +; RV64ZDINX-NEXT: lui a0, %hi(f) +; RV64ZDINX-NEXT: ld a0, %lo(f+8)(a0) +; RV64ZDINX-NEXT: ret +entry: + %add.ptr = getelementptr inbounds i8, ptr @f, i64 8 + %0 = load double, ptr %add.ptr, align 8 + ret double %0 +} From 77f04882251b1e44239d6d7545cd62301e903a4a Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 5 Sep 2024 08:11:29 +0100 Subject: [PATCH 190/425] [AArch64] Combine zext of deinterleaving shuffle. (#107201) This is part 1 of a few patches that are intended to take deinterleaving shuffles with masks like `[0,4,8,12]`, where the shuffle is zero-extended to a larger size, and optimize away the deinterleave. In this case it converts them to `and(uzp1, mask)`, where the `uzp1` act upon the elements in the larger type size to get the lanes into the correct possitions, and the `and` performs the zext. It performs the combine fairly late, on the legalized type so that uitofp that are converted to uitofp(zext(..)) will also be handled. --- .../Target/AArch64/AArch64ISelLowering.cpp | 56 ++++ llvm/test/CodeGen/AArch64/zext-shuffle.ll | 286 +++++++----------- 2 files changed, 163 insertions(+), 179 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5e3f9364ac3e1..d1ddbfa300846 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22187,6 +22187,59 @@ performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255) +// This comes from interleaved vectorization. It is performed late to capture +// uitofp converts too. 
+static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if ((VT != MVT::v4i32 && VT != MVT::v8i16) || + N->getOpcode() != ISD::ZERO_EXTEND || + N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1); + if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements()) + return SDValue(); + + EVT InVT = N->getOperand(0).getOperand(0).getValueType(); + auto *Shuffle = dyn_cast(N->getOperand(0).getOperand(0)); + if (!Shuffle || + InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 || + InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits()) + return SDValue(); + + unsigned Idx; + bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx); + // An undef interleave shuffle can come up after other canonicalizations, + // where the shuffle has been converted to + // zext(extract(shuffle b, undef, [u,u,0,4])) + bool IsUndefDeInterleave = false; + if (!IsDeInterleave) + IsUndefDeInterleave = + Shuffle->getOperand(1).isUndef() && + ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2, + VT.getVectorNumElements() / 2), + 4, Idx); + if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4) + return SDValue(); + SDLoc DL(N); + SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0)); + SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1)); + SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL, + VT, BC1, BC2); + if ((Idx & 1) == 1) + UZP = DAG.getNode(ISD::SRL, DL, VT, UZP, + DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT)); + return DAG.getNode( + ISD::AND, DL, VT, UZP, + DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT)); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -22207,6 +22260,9 @@ static SDValue performExtendCombine(SDNode *N, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } + if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG)) + return R; + if (N->getValueType(0).isFixedLengthVector() && N->getOpcode() == ISD::SIGN_EXTEND && N->getOperand(0)->getOpcode() == ISD::SETCC) diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index 4ef8daf141715..af5a92017bbbc 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -76,12 +76,9 @@ define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) { define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -93,12 +90,8 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_15913: ; 
CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -110,12 +103,9 @@ define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -127,12 +117,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -167,12 +153,9 @@ define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -182,12 +165,8 @@ define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -197,12 +176,9 @@ define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -212,12 +188,8 @@ define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -249,12 +221,8 @@ define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -264,12 +232,8 @@ define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -279,12 +243,8 @@ define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -294,12 +254,8 @@ define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: // kill: def $q0 
killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -310,42 +266,23 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { define <8 x i64> @zext_add(<32 x i16> %l) { ; CHECK-LABEL: zext_add: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: adrp x9, .LCPI22_3 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: adrp x8, .LCPI22_1 -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3] -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: adrp x9, .LCPI22_7 -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7] -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: adrp x8, .LCPI22_5 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b -; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5] -; CHECK-NEXT: adrp x8, .LCPI22_6 -; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6] -; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b -; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b -; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b -; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h -; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h -; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h -; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h -; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s -; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s -; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s -; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: and v3.16b, v5.16b, v4.16b +; CHECK-NEXT: and v6.16b, v0.16b, v4.16b +; CHECK-NEXT: and v7.16b, v1.16b, v4.16b +; CHECK-NEXT: and v4.16b, v2.16b, v4.16b +; CHECK-NEXT: usra v3.4s, v5.4s, #16 +; CHECK-NEXT: usra v6.4s, v0.4s, #16 +; CHECK-NEXT: usra v7.4s, v1.4s, #16 +; CHECK-NEXT: usra v4.4s, v2.4s, #16 +; CHECK-NEXT: uaddl v0.2d, v3.2s, v6.2s +; CHECK-NEXT: uaddl2 v1.2d, v3.4s, v6.4s +; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v4.4s +; CHECK-NEXT: uaddl v2.2d, v7.2s, v4.2s ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = zext <8 x i16> %s1 to <8 x i64> @@ -392,86 +329,77 @@ define <8 x i64> @zext_load_add(ptr %p) { define <8 x double> @uitofp_fadd(<32 x i16> %l) { ; CHECK-LABEL: uitofp_fadd: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: adrp x9, .LCPI24_1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: adrp x10, .LCPI24_6 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0] -; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: adrp x9, .LCPI24_3 -; CHECK-NEXT: ldr q6, 
[x8, :lo12:.LCPI24_2] -; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b -; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3] -; CHECK-NEXT: adrp x9, .LCPI24_5 -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: adrp x8, .LCPI24_7 -; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5] -; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6] -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7] -; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b -; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b -; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b -; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v2.2d, v5.2s, #0 -; CHECK-NEXT: ushll v3.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 -; CHECK-NEXT: ushll v7.2d, v7.2s, #0 -; CHECK-NEXT: ucvtf v18.2d, v20.2d -; CHECK-NEXT: ucvtf v19.2d, v21.2d +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi d4, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s +; CHECK-NEXT: and v17.8b, v6.8b, v4.8b +; CHECK-NEXT: and v18.8b, v7.8b, v4.8b +; CHECK-NEXT: ushr v6.2s, v6.2s, #16 +; CHECK-NEXT: ushr v7.2s, v7.2s, #16 +; CHECK-NEXT: and v21.8b, v0.8b, v4.8b +; CHECK-NEXT: and v22.8b, v2.8b, v4.8b +; CHECK-NEXT: ushr v2.2s, v2.2s, #16 +; CHECK-NEXT: and v19.8b, v16.8b, v4.8b +; CHECK-NEXT: and v20.8b, v5.8b, v4.8b +; CHECK-NEXT: ushll v3.2d, v17.2s, #0 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ushr v16.2s, v16.2s, #16 +; CHECK-NEXT: ushr v5.2s, v5.2s, #16 ; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0 -; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ushll v18.2d, v19.2s, #0 +; CHECK-NEXT: ushll v19.2d, v20.2s, #0 +; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll v22.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v23.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ushll v21.2d, v21.2s, #0 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v22.2d, v22.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: 
ucvtf v6.2d, v6.2d +; CHECK-NEXT: and v23.8b, v20.8b, v4.8b +; CHECK-NEXT: and v4.8b, v1.8b, v4.8b +; CHECK-NEXT: ushr v20.2s, v20.2s, #16 +; CHECK-NEXT: ushr v1.2s, v1.2s, #16 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v19.2d, v19.2d ; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ushll v23.2d, v23.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll v20.2d, v20.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d -; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d +; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d ; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d -; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d -; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d -; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d -; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d -; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d -; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d +; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d +; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d +; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d +; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = uitofp <8 x i16> %s1 to <8 x double> From a7697c86559e9d57c9c0e2b5f2daaa5cec4e5119 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 09:26:53 +0200 Subject: [PATCH 191/425] [ARM] Do not assume alignment in vld1xN and vst1xN intrinsics (#106984) These intrinsics currently assume natural alignment. Instead, respect the alignment attribute on the intrinsic. Teach InstCombine to improve that alignment. If desired I could also adjust the clang frontend to add alignment annotations equivalent to the previous behavior, but I don't see any indication that such an assumption is correct in the ARM intrinsics docs. Fixes https://github.com/llvm/llvm-project/issues/59081. 
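
For reference, a minimal IR sketch of the new behaviour (the struct body and
function names here are placeholders for the example; the updated arm-vld1.ll
tests below exercise the same pattern). Without a param alignment the intrinsic
is now treated as align 1 and no alignment specifier is emitted, while an
explicit `align 16` pointer argument still produces the :128 addressing form:

  %struct.uint16x4x2_t = type { [2 x <4 x i16>] }   ; assumed layout for the sketch

  declare %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr)

  define %struct.uint16x4x2_t @ld_noalign(ptr %a) {
    ; no param alignment -> align 1, emits: vld1.16 {d16, d17}, [r0]
    %t = call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr %a)
    ret %struct.uint16x4x2_t %t
  }

  define %struct.uint16x4x2_t @ld_align16(ptr %a) {
    ; align 16 on the pointer argument -> emits: vld1.16 {d16, d17}, [r0:128]
    %t = call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 %a)
    ret %struct.uint16x4x2_t %t
  }

Since InstCombine now upgrades the param alignment whenever the pointer's
alignment is provable, code with known-aligned pointers can still get the
:64/:128 addressing forms.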
--- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 16 ++ llvm/test/CodeGen/ARM/arm-vld1.ll | 182 ++++++++++------- llvm/test/CodeGen/ARM/arm-vst1.ll | 193 ++++++++++-------- .../test/CodeGen/ARM/bf16-intrinsics-ld-st.ll | 32 +-- .../InstCombine/ARM/neon-intrinsics.ll | 6 +- 6 files changed, 256 insertions(+), 177 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9096617a94855..aa663556deb76 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21073,7 +21073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; - Info.align.reset(); + Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -21120,7 +21120,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align.reset(); + Info.align = I.getParamAlign(0).valueOrOne(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 912569a8fec11..9b5349241c341 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -163,6 +163,22 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::arm_neon_vld1x2: + case Intrinsic::arm_neon_vld1x3: + case Intrinsic::arm_neon_vld1x4: + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: { + Align NewAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + Align OldAlign = II.getParamAlign(0).valueOrOne(); + if (NewAlign > OldAlign) + II.addParamAttr(0, + Attribute::getWithAlignment(II.getContext(), NewAlign)); + break; + } + case Intrinsic::arm_mve_pred_i2v: { Value *Arg = II.getArgOperand(0); Value *ArgArg; diff --git a/llvm/test/CodeGen/ARM/arm-vld1.ll b/llvm/test/CodeGen/ARM/arm-vld1.ll index 78b0b92013c39..ec2793589759e 100644 --- a/llvm/test/CodeGen/ARM/arm-vld1.ll +++ b/llvm/test/CodeGen/ARM/arm-vld1.ll @@ -68,7 +68,7 @@ declare %struct.uint8x16x4_t @llvm.arm.neon.vld1x4.v16i8.p0(ptr) nounwind readon define %struct.uint16x4x2_t @test_vld1_u16_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x2: -; CHECK: vld1.16 {d16, d17}, [r0:64] +; CHECK: vld1.16 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -76,9 +76,39 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2(ptr %a) nounwind { ret %struct.uint16x4x2_t %tmp } +define %struct.uint16x4x2_t @test_vld1_u16_x2_align8(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align8: +; CHECK: vld1.16 {d16, d17}, [r0:64] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 %a) + ret %struct.uint16x4x2_t %tmp +} + +define %struct.uint16x4x2_t @test_vld1_u16_x2_align16(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align16: +; 
CHECK: vld1.16 {d16, d17}, [r0:128] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 %a) + ret %struct.uint16x4x2_t %tmp +} + +define %struct.uint16x4x2_t @test_vld1_u16_x2_align32(ptr %a) nounwind { +; CHECK-LABEL: test_vld1_u16_x2_align32: +; CHECK: vld1.16 {d16, d17}, [r0:128] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 32 %a) + ret %struct.uint16x4x2_t %tmp +} + define %struct.uint16x4x3_t @test_vld1_u16_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x3: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64] +; CHECK: vld1.16 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -89,7 +119,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3(ptr %a) nounwind { define %struct.uint16x4x4_t @test_vld1_u16_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u16_x4: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! ; CHECK-NEXT: vst1.16 {d18}, [r0:64]! @@ -101,7 +131,7 @@ define %struct.uint16x4x4_t @test_vld1_u16_x4(ptr %a) nounwind { define %struct.uint32x2x2_t @test_vld1_u32_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x2: -; CHECK: vld1.32 {d16, d17}, [r0:64] +; CHECK: vld1.32 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -111,7 +141,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2(ptr %a) nounwind { define %struct.uint32x2x3_t @test_vld1_u32_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x3: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64] +; CHECK: vld1.32 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -122,7 +152,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3(ptr %a) nounwind { define %struct.uint32x2x4_t @test_vld1_u32_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u32_x4: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! ; CHECK-NEXT: vst1.32 {d18}, [r0:64]! @@ -134,7 +164,7 @@ define %struct.uint32x2x4_t @test_vld1_u32_x4(ptr %a) nounwind { define %struct.uint64x1x2_t @test_vld1_u64_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x2: -; CHECK: vld1.64 {d16, d17}, [r0:64] +; CHECK: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -144,7 +174,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2(ptr %a) nounwind { define %struct.uint64x1x3_t @test_vld1_u64_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x3: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64] +; CHECK: vld1.64 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -155,7 +185,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3(ptr %a) nounwind { define %struct.uint64x1x4_t @test_vld1_u64_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u64_x4: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! ; CHECK-NEXT: vst1.64 {d18}, [r0:64]! 
@@ -167,7 +197,7 @@ define %struct.uint64x1x4_t @test_vld1_u64_x4(ptr %a) nounwind { define %struct.uint8x8x2_t @test_vld1_u8_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x2: -; CHECK: vld1.8 {d16, d17}, [r0:64] +; CHECK: vld1.8 {d16, d17}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr @@ -177,7 +207,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2(ptr %a) nounwind { define %struct.uint8x8x3_t @test_vld1_u8_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x3: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64] +; CHECK: vld1.8 {d16, d17, d18}, [r1] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! ; CHECK-NEXT: vstr d18, [r0] @@ -188,7 +218,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3(ptr %a) nounwind { define %struct.uint8x8x4_t @test_vld1_u8_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1_u8_x4: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! ; CHECK-NEXT: vst1.8 {d18}, [r0:64]! @@ -200,7 +230,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4(ptr %a) nounwind { define %struct.uint16x8x2_t @test_vld1q_u16_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x2: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -210,8 +240,8 @@ define %struct.uint16x8x2_t @test_vld1q_u16_x2(ptr %a) nounwind { define %struct.uint16x8x3_t @test_vld1q_u16_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x3: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1:64] +; CHECK: vld1.16 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -222,8 +252,8 @@ define %struct.uint16x8x3_t @test_vld1q_u16_x3(ptr %a) nounwind { define %struct.uint16x8x4_t @test_vld1q_u16_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u16_x4: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.16 {d20, d21}, [r0]! @@ -235,7 +265,7 @@ define %struct.uint16x8x4_t @test_vld1q_u16_x4(ptr %a) nounwind { define %struct.uint32x4x2_t @test_vld1q_u32_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x2: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -245,8 +275,8 @@ define %struct.uint32x4x2_t @test_vld1q_u32_x2(ptr %a) nounwind { define %struct.uint32x4x3_t @test_vld1q_u32_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x3: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1:64] +; CHECK: vld1.32 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -257,8 +287,8 @@ define %struct.uint32x4x3_t @test_vld1q_u32_x3(ptr %a) nounwind { define %struct.uint32x4x4_t @test_vld1q_u32_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u32_x4: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! 
-; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.32 {d20, d21}, [r0]! @@ -270,7 +300,7 @@ define %struct.uint32x4x4_t @test_vld1q_u32_x4(ptr %a) nounwind { define %struct.uint64x2x2_t @test_vld1q_u64_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x2: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -280,8 +310,8 @@ define %struct.uint64x2x2_t @test_vld1q_u64_x2(ptr %a) nounwind { define %struct.uint64x2x3_t @test_vld1q_u64_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x3: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1:64] +; CHECK: vld1.64 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -292,8 +322,8 @@ define %struct.uint64x2x3_t @test_vld1q_u64_x3(ptr %a) nounwind { define %struct.uint64x2x4_t @test_vld1q_u64_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u64_x4: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0]! @@ -305,7 +335,7 @@ define %struct.uint64x2x4_t @test_vld1q_u64_x4(ptr %a) nounwind { define %struct.uint8x16x2_t @test_vld1q_u8_x2(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x2: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] ; CHECK-NEXT: bx lr @@ -315,8 +345,8 @@ define %struct.uint8x16x2_t @test_vld1q_u8_x2(ptr %a) nounwind { define %struct.uint8x16x3_t @test_vld1q_u8_x3(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x3: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1:64] +; CHECK: vld1.8 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0] @@ -327,8 +357,8 @@ define %struct.uint8x16x3_t @test_vld1q_u8_x3(ptr %a) nounwind { define %struct.uint8x16x4_t @test_vld1q_u8_x4(ptr %a) nounwind { ; CHECK-LABEL: test_vld1q_u8_x4: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1:256] +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! ; CHECK-NEXT: vst1.8 {d20, d21}, [r0]! @@ -344,7 +374,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u16_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.16 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.16 {d16, d17}, [r0]! 
; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -362,7 +392,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #1 -; CHECK-NEXT: vld1.16 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.16 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -377,7 +407,7 @@ define %struct.uint16x4x2_t @test_vld1_u16_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint16x4x3_t @test_vld1_u16_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u16_x3_post_imm: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.16 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -392,7 +422,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint16x4x3_t @test_vld1_u16_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u16_x3_post_reg: ; CHECK: lsl r3, r3, #1 -; CHECK-NEXT: vld1.16 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.16 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -406,7 +436,7 @@ define %struct.uint16x4x3_t @test_vld1_u16_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint16x4x4_t @test_vld1_u16_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u16_x4_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -422,7 +452,7 @@ define %struct.uint16x4x4_t @test_vld1_u16_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint16x4x4_t @test_vld1_u16_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u16_x4_post_reg: ; CHECK: lsl r3, r3, #1 -; CHECK-NEXT: vld1.16 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.16 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16}, [r0:64]! ; CHECK-NEXT: vst1.16 {d17}, [r0:64]! @@ -439,7 +469,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u32_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.32 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -457,7 +487,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #2 -; CHECK-NEXT: vld1.32 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -472,7 +502,7 @@ define %struct.uint32x2x2_t @test_vld1_u32_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint32x2x3_t @test_vld1_u32_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u32_x3_post_imm: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.32 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! 
@@ -487,7 +517,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint32x2x3_t @test_vld1_u32_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u32_x3_post_reg: ; CHECK: lsl r3, r3, #2 -; CHECK-NEXT: vld1.32 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.32 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -501,7 +531,7 @@ define %struct.uint32x2x3_t @test_vld1_u32_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint32x2x4_t @test_vld1_u32_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u32_x4_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -517,7 +547,7 @@ define %struct.uint32x2x4_t @test_vld1_u32_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint32x2x4_t @test_vld1_u32_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u32_x4_post_reg: ; CHECK: lsl r3, r3, #2 -; CHECK-NEXT: vld1.32 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.32 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16}, [r0:64]! ; CHECK-NEXT: vst1.32 {d17}, [r0:64]! @@ -534,7 +564,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_imm(ptr %a, ptr %ptr) nounwin ; CHECK-LABEL: test_vld1_u64_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -552,7 +582,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_reg(ptr %a, ptr %ptr, i32 %in ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: lsl r2, r2, #3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.64 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -567,7 +597,7 @@ define %struct.uint64x1x2_t @test_vld1_u64_x2_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint64x1x3_t @test_vld1_u64_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u64_x3_post_imm: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.64 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -582,7 +612,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint64x1x3_t @test_vld1_u64_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u64_x3_post_reg: ; CHECK: lsl r3, r3, #3 -; CHECK-NEXT: vld1.64 {d16, d17, d18}, [r1:64], r3 +; CHECK-NEXT: vld1.64 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -596,7 +626,7 @@ define %struct.uint64x1x3_t @test_vld1_u64_x3_post_reg(ptr %a, ptr %ptr, i32 %in define %struct.uint64x1x4_t @test_vld1_u64_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u64_x4_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! 
@@ -612,7 +642,7 @@ define %struct.uint64x1x4_t @test_vld1_u64_x4_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint64x1x4_t @test_vld1_u64_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u64_x4_post_reg: ; CHECK: lsl r3, r3, #3 -; CHECK-NEXT: vld1.64 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK-NEXT: vld1.64 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16}, [r0:64]! ; CHECK-NEXT: vst1.64 {d17}, [r0:64]! @@ -629,7 +659,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_imm(ptr %a, ptr %ptr) nounwind ; CHECK-LABEL: test_vld1_u8_x2_post_imm: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.8 {d16, d17}, [r0:64]! +; CHECK-NEXT: vld1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -646,7 +676,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_reg(ptr %a, ptr %ptr, i32 %inc) ; CHECK-LABEL: test_vld1_u8_x2_post_reg: ; CHECK: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: vld1.8 {d16, d17}, [r0:64], r2 +; CHECK-NEXT: vld1.8 {d16, d17}, [r0], r2 ; CHECK-NEXT: vmov lr, r12, d16 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: vmov r2, r3, d17 @@ -661,7 +691,7 @@ define %struct.uint8x8x2_t @test_vld1_u8_x2_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint8x8x3_t @test_vld1_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u8_x3_post_imm: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! +; CHECK: vld1.8 {d16, d17, d18}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -675,7 +705,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind define %struct.uint8x8x3_t @test_vld1_u8_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u8_x3_post_reg: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64], r3 +; CHECK: vld1.8 {d16, d17, d18}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -689,7 +719,7 @@ define %struct.uint8x8x3_t @test_vld1_u8_x3_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint8x8x4_t @test_vld1_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1_u8_x4_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -704,7 +734,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind define %struct.uint8x8x4_t @test_vld1_u8_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) nounwind { ; CHECK-LABEL: test_vld1_u8_x4_post_reg: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256], r3 +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1], r3 ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! ; CHECK-NEXT: vst1.8 {d17}, [r0:64]! @@ -719,7 +749,7 @@ define %struct.uint8x8x4_t @test_vld1_u8_x4_post_reg(ptr %a, ptr %ptr, i32 %inc) define %struct.uint16x8x2_t @test_vld1q_u16_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x2_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! 
; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -732,8 +762,8 @@ define %struct.uint16x8x2_t @test_vld1q_u16_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint16x8x3_t @test_vld1q_u16_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x3_post_imm: -; CHECK: vld1.16 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.16 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.16 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! @@ -747,8 +777,8 @@ define %struct.uint16x8x3_t @test_vld1q_u16_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint16x8x4_t @test_vld1q_u16_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u16_x4_post_imm: -; CHECK: vld1.16 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.16 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.16 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.16 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.16 {d18, d19}, [r0]! @@ -763,7 +793,7 @@ define %struct.uint16x8x4_t @test_vld1q_u16_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x2_t @test_vld1q_u32_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x2_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -776,8 +806,8 @@ define %struct.uint32x4x2_t @test_vld1q_u32_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x3_t @test_vld1q_u32_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x3_post_imm: -; CHECK: vld1.32 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.32 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.32 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! @@ -791,8 +821,8 @@ define %struct.uint32x4x3_t @test_vld1q_u32_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint32x4x4_t @test_vld1q_u32_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u32_x4_post_imm: -; CHECK: vld1.32 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.32 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.32 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! @@ -807,7 +837,7 @@ define %struct.uint32x4x4_t @test_vld1q_u32_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x2_t @test_vld1q_u64_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x2_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -820,8 +850,8 @@ define %struct.uint64x2x2_t @test_vld1q_u64_x2_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x3_t @test_vld1q_u64_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x3_post_imm: -; CHECK: vld1.64 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.64 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.64 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! 
@@ -835,8 +865,8 @@ define %struct.uint64x2x3_t @test_vld1q_u64_x3_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint64x2x4_t @test_vld1q_u64_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u64_x4_post_imm: -; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.64 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.64 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.64 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0]! @@ -851,7 +881,7 @@ define %struct.uint64x2x4_t @test_vld1q_u64_x4_post_imm(ptr %a, ptr %ptr) nounwi define %struct.uint8x16x2_t @test_vld1q_u8_x2_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x2_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r0] @@ -864,8 +894,8 @@ define %struct.uint8x16x2_t @test_vld1q_u8_x2_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint8x16x3_t @test_vld1q_u8_x3_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x3_post_imm: -; CHECK: vld1.8 {d16, d17, d18}, [r1:64]! -; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1:64]! +; CHECK: vld1.8 {d16, d17, d18}, [r1]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! @@ -879,8 +909,8 @@ define %struct.uint8x16x3_t @test_vld1q_u8_x3_post_imm(ptr %a, ptr %ptr) nounwin define %struct.uint8x16x4_t @test_vld1q_u8_x4_post_imm(ptr %a, ptr %ptr) nounwind { ; CHECK-LABEL: test_vld1q_u8_x4_post_imm: -; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256]! -; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1:256]! +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1]! ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vst1.8 {d18, d19}, [r0]! 
diff --git a/llvm/test/CodeGen/ARM/arm-vst1.ll b/llvm/test/CodeGen/ARM/arm-vst1.ll index 7dacd8b0b99f9..5d0a7e9614ce9 100644 --- a/llvm/test/CodeGen/ARM/arm-vst1.ll +++ b/llvm/test/CodeGen/ARM/arm-vst1.ll @@ -92,7 +92,7 @@ declare void @llvm.arm.neon.vst1x4.p0.v16i8(ptr nocapture, <16 x i8>, <16 x i8>, define arm_aapcs_vfpcc void @test_vst1_u16_x2(ptr %a, %struct.uint16x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x2: -; CHECK: vst1.16 {d0, d1}, [r0:64] +; CHECK: vst1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 @@ -101,9 +101,42 @@ entry: ret void } +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align8(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align8: +; CHECK: vst1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 8 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align16(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align16: +; CHECK: vst1.16 {d0, d1}, [r0:128] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 16 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1_u16_x2_align32(ptr %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2_align32: +; CHECK: vst1.16 {d0, d1}, [r0:128] +; CHECK-NEXT: bx lr +entry: + %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 + %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 + tail call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 32 %a, <4 x i16> %b0, <4 x i16> %b1) + ret void +} + define arm_aapcs_vfpcc void @test_vst1_u16_x3(ptr %a, %struct.uint16x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x3: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64] +; CHECK: vst1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 @@ -115,7 +148,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u16_x4(ptr %a, %struct.uint16x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x4: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 @@ -128,7 +161,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x2(ptr %a, %struct.uint32x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x2: -; CHECK: vst1.32 {d0, d1}, [r0:64] +; CHECK: vst1.32 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -139,7 +172,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x3(ptr %a, %struct.uint32x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x3: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64] +; CHECK: vst1.32 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -151,7 +184,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u32_x4(ptr %a, %struct.uint32x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x4: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -164,7 +197,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x2(ptr %a, %struct.uint64x1x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x2: -; CHECK: vst1.64 {d0, d1}, [r0:64] +; 
CHECK: vst1.64 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -175,7 +208,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x3(ptr %a, %struct.uint64x1x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x3: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64] +; CHECK: vst1.64 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -187,7 +220,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u64_x4(ptr %a, %struct.uint64x1x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x4: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -200,7 +233,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x2(ptr %a, %struct.uint8x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x2: -; CHECK: vst1.8 {d0, d1}, [r0:64] +; CHECK: vst1.8 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -211,7 +244,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x3(ptr %a, %struct.uint8x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x3: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64] +; CHECK: vst1.8 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -223,7 +256,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1_u8_x4(ptr %a, %struct.uint8x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x4: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -236,7 +269,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x2(ptr %a, %struct.uint16x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -247,8 +280,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x3(ptr %a, %struct.uint16x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x3: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] +; CHECK: vst1.16 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 @@ -260,8 +293,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u16_x4(ptr %a, %struct.uint16x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x4: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 @@ -274,7 +307,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x2(ptr %a, %struct.uint32x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -285,8 +318,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x3(ptr %a, %struct.uint32x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x3: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0:64] +; CHECK: vst1.32 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 @@ -298,8 +331,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u32_x4(ptr %a, %struct.uint32x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x4: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 @@ -312,7 +345,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x2(ptr %a, %struct.uint64x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -323,8 +356,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x3(ptr %a, %struct.uint64x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x3: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0:64] +; CHECK: vst1.64 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 @@ -336,8 +369,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u64_x4(ptr %a, %struct.uint64x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x4: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 @@ -350,7 +383,7 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x2(ptr %a, %struct.uint8x16x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -361,8 +394,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x3(ptr %a, %struct.uint8x16x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x3: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0:64] +; CHECK: vst1.8 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 @@ -374,8 +407,8 @@ entry: define arm_aapcs_vfpcc void @test_vst1q_u8_x4(ptr %a, %struct.uint8x16x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x4: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0:256] +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 @@ -390,7 +423,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x2_post_imm(ptr %a, %struct.uint8x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x2_post_imm: -; CHECK: vst1.8 {d0, d1}, [r0:64]! +; CHECK: vst1.8 {d0, d1}, [r0]! 
; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -402,7 +435,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x2_post_reg(ptr %a, %struct.uint8x8x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x2_post_reg: -; CHECK: vst1.8 {d0, d1}, [r0:64], r1 +; CHECK: vst1.8 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 @@ -414,7 +447,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_imm(ptr %a, %struct.uint16x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x2_post_imm: -; CHECK: vst1.16 {d0, d1}, [r0:64]! +; CHECK: vst1.16 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 @@ -426,7 +459,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_imm(ptr %a, %struct.uint16x4x2 define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_reg(ptr %a, %struct.uint16x4x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x2_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.16 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 @@ -437,7 +470,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x2_post_reg(ptr %a, %struct.uint16x4x2 define arm_aapcs_vfpcc ptr @test_vst1_u32_x2_post_imm(ptr %a, %struct.uint32x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x2_post_imm: -; CHECK: vst1.32 {d0, d1}, [r0:64]! +; CHECK: vst1.32 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -450,7 +483,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x2_post_reg(ptr %a, %struct.uint32x2x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x2_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.32 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 @@ -462,7 +495,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x2_post_imm(ptr %a, %struct.uint64x1x2_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x2_post_imm: -; CHECK: vst1.64 {d0, d1}, [r0:64]! +; CHECK: vst1.64 {d0, d1}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -475,7 +508,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x2_post_reg(ptr %a, %struct.uint64x1x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x2_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1}, [r0:64], r1 +; CHECK-NEXT: vst1.64 {d0, d1}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 @@ -487,7 +520,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x2_post_imm(ptr %a, %struct.uint8x16x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -499,7 +532,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x2_post_reg(ptr %a, %struct.uint8x16x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u8_x2_post_reg: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 @@ -511,7 +544,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x2_post_imm(ptr %a, %struct.uint16x8x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! 
+; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -524,7 +557,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x2_post_reg(ptr %a, %struct.uint16x8x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u16_x2_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 @@ -536,7 +569,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x2_post_imm(ptr %a, %struct.uint32x4x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -549,7 +582,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x2_post_reg(ptr %a, %struct.uint32x4x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u32_x2_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 @@ -561,7 +594,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x2_post_imm(ptr %a, %struct.uint64x2x2_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -574,7 +607,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x2_post_reg(ptr %a, %struct.uint64x2x2_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1q_u64_x2_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 @@ -587,7 +620,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x3_post_imm(ptr %a, %struct.uint8x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x3_post_imm: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.8 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -600,7 +633,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x3_post_reg(ptr %a, %struct.uint8x8x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x3_post_reg: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64], r1 +; CHECK: vst1.8 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 @@ -613,7 +646,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_imm(ptr %a, %struct.uint16x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x3_post_imm: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.16 {d0, d1, d2}, [r0]! 
; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 @@ -626,7 +659,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_imm(ptr %a, %struct.uint16x4x3 define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_reg(ptr %a, %struct.uint16x4x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x3_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 @@ -638,7 +671,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x3_post_reg(ptr %a, %struct.uint16x4x3 define arm_aapcs_vfpcc ptr @test_vst1_u32_x3_post_imm(ptr %a, %struct.uint32x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x3_post_imm: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.32 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -652,7 +685,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x3_post_reg(ptr %a, %struct.uint32x2x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x3_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 @@ -665,7 +698,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x3_post_imm(ptr %a, %struct.uint64x1x3_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x3_post_imm: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! +; CHECK: vst1.64 {d0, d1, d2}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -679,7 +712,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x3_post_reg(ptr %a, %struct.uint64x1x3_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x3_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2}, [r0:64], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 @@ -692,8 +725,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x3_post_imm(ptr %a, %struct.uint8x16x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x3_post_imm: -; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.8 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 @@ -706,8 +739,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x3_post_imm(ptr %a, %struct.uint16x8x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x3_post_imm: -; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.16 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 @@ -720,8 +753,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x3_post_imm(ptr %a, %struct.uint32x4x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x3_post_imm: -; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.32 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 @@ -734,8 +767,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x3_post_imm(ptr %a, %struct.uint64x2x3_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x3_post_imm: -; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! 
-; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0:64]! +; CHECK: vst1.64 {d0, d1, d2}, [r0]! +; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 @@ -748,7 +781,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x4_post_imm(ptr %a, %struct.uint8x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u8_x4_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -762,7 +795,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u8_x4_post_reg(ptr %a, %struct.uint8x8x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u8_x4_post_reg: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 @@ -776,7 +809,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_imm(ptr %a, %struct.uint16x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u16_x4_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 @@ -790,7 +823,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_imm(ptr %a, %struct.uint16x4x4 define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_reg(ptr %a, %struct.uint16x4x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u16_x4_post_reg: ; CHECK: lsl r1, r1, #1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 @@ -803,7 +836,7 @@ define arm_aapcs_vfpcc ptr @test_vst1_u16_x4_post_reg(ptr %a, %struct.uint16x4x4 define arm_aapcs_vfpcc ptr @test_vst1_u32_x4_post_imm(ptr %a, %struct.uint32x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u32_x4_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -818,7 +851,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u32_x4_post_reg(ptr %a, %struct.uint32x2x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u32_x4_post_reg: ; CHECK: lsl r1, r1, #2 -; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.32 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 @@ -832,7 +865,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x4_post_imm(ptr %a, %struct.uint64x1x4_t %b) nounwind { ; CHECK-LABEL: test_vst1_u64_x4_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -847,7 +880,7 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1_u64_x4_post_reg(ptr %a, %struct.uint64x1x4_t %b, i32 %inc) nounwind { ; CHECK-LABEL: test_vst1_u64_x4_post_reg: ; CHECK: lsl r1, r1, #3 -; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0:256], r1 +; CHECK-NEXT: vst1.64 {d0, d1, d2, d3}, [r0], r1 ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 @@ -861,8 +894,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u8_x4_post_imm(ptr %a, %struct.uint8x16x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u8_x4_post_imm: -; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0]! 
+; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 @@ -876,8 +909,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u16_x4_post_imm(ptr %a, %struct.uint16x8x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u16_x4_post_imm: -; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 @@ -891,8 +924,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u32_x4_post_imm(ptr %a, %struct.uint32x4x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u32_x4_post_imm: -; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 @@ -906,8 +939,8 @@ entry: define arm_aapcs_vfpcc ptr @test_vst1q_u64_x4_post_imm(ptr %a, %struct.uint64x2x4_t %b) nounwind { ; CHECK-LABEL: test_vst1q_u64_x4_post_imm: -; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0:256]! +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0]! ; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll index e49128f53b115..846cf239e8987 100644 --- a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll +++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll @@ -60,7 +60,7 @@ entry: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr %ptr) @@ -76,7 +76,7 @@ entry: define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr %ptr) @@ -92,7 +92,7 @@ entry: define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr %ptr) @@ -111,8 +111,8 @@ entry: define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr %ptr) @@ -131,7 +131,7 @@ entry: define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(ptr %ptr) { ; CHECK-LABEL: test_vld1_bf16_x4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr %ptr) @@ -153,8 +153,8 @@ entry: define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(ptr %ptr) { ; CHECK-LABEL: test_vld1q_bf16_x4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr %ptr) @@ -635,7 +635,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <2 x i32 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 @@ -651,7 +651,7 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <4 x i3 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 @@ -668,7 +668,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <2 x i32 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 @@ -687,8 +687,8 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <4 x i3 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]! -; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0]! 
+; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 @@ -708,7 +708,7 @@ define arm_aapcs_vfpcc void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <2 x i32 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 @@ -730,8 +730,8 @@ define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <4 x i3 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]! -; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0] ; CHECK-NEXT: bx lr entry: %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 diff --git a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll index 68dda24b53747..b0435f2a276f1 100644 --- a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll @@ -29,7 +29,7 @@ define void @test() { define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align(ptr align 16 %a) { ; CHECK-LABEL: define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align( ; CHECK-SAME: ptr align 16 [[A:%.*]]) { -; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr [[A]]) +; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 [[A]]) ; CHECK-NEXT: ret { <4 x i16>, <4 x i16> } [[TMP]] ; %tmp = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr %a) @@ -39,7 +39,7 @@ define { <4 x i16>, <4 x i16> } @test_vld1x2_no_align(ptr align 16 %a) { define { <4 x i16>, <4 x i16> } @test_vld1x2_lower_align(ptr align 16 %a) { ; CHECK-LABEL: define { <4 x i16>, <4 x i16> } @test_vld1x2_lower_align( ; CHECK-SAME: ptr align 16 [[A:%.*]]) { -; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 [[A]]) +; CHECK-NEXT: [[TMP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 16 [[A]]) ; CHECK-NEXT: ret { <4 x i16>, <4 x i16> } [[TMP]] ; %tmp = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld1x2.v4i16.p0(ptr align 8 %a) @@ -59,7 +59,7 @@ define { <4 x i16>, <4 x i16> } @test_vld1x2_higher_align(ptr align 8 %a) { define void @test_vst1x2_no_align(ptr align 16 %a, <4 x i16> %b0, <4 x i16> %b1) { ; CHECK-LABEL: define void @test_vst1x2_no_align( ; CHECK-SAME: ptr align 16 [[A:%.*]], <4 x i16> [[B0:%.*]], <4 x i16> [[B1:%.*]]) { -; CHECK-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[B0]], <4 x i16> [[B1]]) +; CHECK-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr align 16 [[A]], <4 x i16> [[B0]], <4 x i16> [[B1]]) ; CHECK-NEXT: ret void ; call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> %b0, <4 x i16> %b1) From 3d729571fdc86a40218e5743d4386d7d8edc36ae Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 15:27:48 +0800 Subject: [PATCH 192/425] [RISCV] Model dest 
EEW and fix peepholes not checking EEW (#105945) Previously for vector peepholes that fold based on VL, we checked if the VLMAX is the same as a proxy to check that the EEWs were the same. This only worked at LMUL >= 1 because the EMULs of the Src output and user's input had to be the same because the register classes needed to match. At fractional LMULs we would have incorrectly folded something like this: %x:vr = PseudoVADD_VV_MF4 $noreg, $noreg, $noreg, 4, 4 /* e16 */, 0 %y:vr = PseudoVMV_V_V_MF8 $noreg, %x, 4, 3 /* e8 */, 0 This models the EEW of the destination operands of vector instructions with a TSFlag, which is enough to fix the incorrect folding. There's some overlap with the TargetOverlapConstraintType and IsRVVWideningReduction. If we model the source operands as well we may be able to subsume them. --- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 8 +++ llvm/lib/Target/RISCV/RISCVInstrFormats.td | 12 ++++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 12 ++++ llvm/lib/Target/RISCV/RISCVInstrInfo.h | 4 ++ llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 57 +++++++++++-------- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 9 ++- llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 5 +- llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 3 +- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 27 +++++---- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 16 ++++++ .../Target/RISCV/RISCVInstrInfoTest.cpp | 21 +++++++ 11 files changed, 133 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7a0b35c1afce2..cf3ea3e4ea213 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -129,6 +129,14 @@ enum { ElementsDependOnMaskShift = ElementsDependOnVLShift + 1, ElementsDependOnMaskMask = 1ULL << ElementsDependOnMaskShift, + + // Indicates the EEW of a vector instruction's destination operand. + // 0 -> 1 + // 1 -> SEW + // 2 -> SEW * 2 + // 3 -> SEW * 4 + DestEEWShift = ElementsDependOnMaskShift + 1, + DestEEWMask = 3ULL << DestEEWShift, }; // Helper functions to read TSFlags. diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index a389320adc876..fcea18f81b390 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -167,6 +167,14 @@ def EltDepsNone : EltDeps; def EltDepsVL : EltDeps; def EltDepsVLMask : EltDeps; +class EEW val> { + bits<2> Value = val; +} +def EEW1 : EEW<0>; +def EEWSEWx1 : EEW<1>; +def EEWSEWx2 : EEW<2>; +def EEWSEWx4 : EEW<3>; + class RVInstCommon pattern, InstFormat format> : Instruction { let Namespace = "RISCV"; @@ -240,6 +248,10 @@ class RVInstCommonBaseInstr; } + +unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) { + unsigned DestEEW = + (Desc.TSFlags & RISCVII::DestEEWMask) >> RISCVII::DestEEWShift; + // EEW = 1 + if (DestEEW == 0) + return 0; + // EEW = SEW * n + unsigned Scaled = Log2SEW + (DestEEW - 1); + assert(Scaled >= 3 && Scaled <= 6); + return Scaled; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 8494110adffb9..e040891539ddf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -354,6 +354,10 @@ std::optional getVectorLowDemandedScalarBits(uint16_t Opcode, // Returns the MC opcode of RVV pseudo instruction. 
unsigned getRVVMCOpcode(unsigned RVVPseudoOpcode); +// For a (non-pseudo) RVV instruction \p Desc and the given \p Log2SEW, returns +// the log2 EEW of the destination operand. +unsigned getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW); + // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. static constexpr int64_t VLMaxSentinel = -1LL; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 738bb5d9bd65b..6f7d14d5503bd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1104,7 +1104,7 @@ def : InstAlias<"vneg.v $vd, $vs", (VRSUB_VX VR:$vd, VR:$vs, X0, zero_reg)>; // The destination vector register group cannot overlap a source vector // register group of a different element width (including the mask register // if masked), otherwise an illegal instruction exception is raised. -let Constraints = "@earlyclobber $vd" in { +let Constraints = "@earlyclobber $vd", DestEEW = EEWSEWx2 in { let RVVConstraint = WidenV in { defm VWADDU_V : VALU_MV_V_X<"vwaddu", 0b110000, "v">; defm VWSUBU_V : VALU_MV_V_X<"vwsubu", 0b110010, "v">; @@ -1121,7 +1121,7 @@ defm VWSUBU_W : VALU_MV_V_X<"vwsubu", 0b110110, "w">; defm VWADD_W : VALU_MV_V_X<"vwadd", 0b110101, "w">; defm VWSUB_W : VALU_MV_V_X<"vwsub", 0b110111, "w">; } // RVVConstraint = WidenW -} // Constraints = "@earlyclobber $vd" +} // Constraints = "@earlyclobber $vd", DestEEW = EEWSEWx2 def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm", (VWADD_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>; @@ -1147,10 +1147,11 @@ defm VMADC_V : VALUm_IV_V_X_I<"vmadc", 0b010001>; defm VMADC_V : VALUNoVm_IV_V_X_I<"vmadc", 0b010001>; } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint defm VSBC_V : VALUm_IV_V_X<"vsbc", 0b010010>; -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, + DestEEW = EEW1 in { defm VMSBC_V : VALUm_IV_V_X<"vmsbc", 0b010011>; defm VMSBC_V : VALUNoVm_IV_V_X<"vmsbc", 0b010011>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, DestEEW = EEW1 // Vector Bitwise Logical Instructions defm VAND_V : VALU_IV_V_X_I<"vand", 0b001001>; @@ -1183,7 +1184,7 @@ def : InstAlias<"vncvt.x.x.w $vd, $vs", (VNSRL_WX VR:$vd, VR:$vs, X0, zero_reg)>; // Vector Integer Comparison Instructions -let RVVConstraint = NoConstraint in { +let RVVConstraint = NoConstraint, DestEEW = EEW1 in { defm VMSEQ_V : VCMP_IV_V_X_I<"vmseq", 0b011000>; defm VMSNE_V : VCMP_IV_V_X_I<"vmsne", 0b011001>; defm VMSLTU_V : VCMP_IV_V_X<"vmsltu", 0b011010>; @@ -1192,7 +1193,7 @@ defm VMSLEU_V : VCMP_IV_V_X_I<"vmsleu", 0b011100>; defm VMSLE_V : VCMP_IV_V_X_I<"vmsle", 0b011101>; defm VMSGTU_V : VCMP_IV_X_I<"vmsgtu", 0b011110>; defm VMSGT_V : VCMP_IV_X_I<"vmsgt", 0b011111>; -} // RVVConstraint = NoConstraint +} // RVVConstraint = NoConstraint, DestEEW = EEW1 def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm", (VMSLTU_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; @@ -1204,7 +1205,7 @@ def : InstAlias<"vmsge.vv $vd, $va, $vb$vm", (VMSLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0, - mayStore = 0 in { + mayStore = 0, DestEEW = EEW1 in { // For unsigned comparisons we need to special case 0 immediate to maintain // the always true/false semantics we would invert if we just decremented the // immediate like we do for signed. 
To match the GNU assembler we will use @@ -1227,7 +1228,7 @@ def PseudoVMSLT_VI : Pseudo<(outs VR:$vd), } let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0, - mayStore = 0 in { + mayStore = 0, DestEEW = EEW1 in { def PseudoVMSGEU_VX : Pseudo<(outs VR:$vd), (ins VR:$vs2, GPR:$rs1), [], "vmsgeu.vx", "$vd, $vs2, $rs1">; @@ -1267,11 +1268,12 @@ defm VREMU_V : VDIV_MV_V_X<"vremu", 0b100010>; defm VREM_V : VDIV_MV_V_X<"vrem", 0b100011>; // Vector Widening Integer Multiply Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { +let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + DestEEW = EEWSEWx2 in { defm VWMUL_V : VWMUL_MV_V_X<"vwmul", 0b111011>; defm VWMULU_V : VWMUL_MV_V_X<"vwmulu", 0b111000>; defm VWMULSU_V : VWMUL_MV_V_X<"vwmulsu", 0b111010>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, DestEEW = EEWSEWx2 // Vector Single-Width Integer Multiply-Add Instructions defm VMACC_V : VMAC_MV_V_X<"vmacc", 0b101101>; @@ -1280,10 +1282,12 @@ defm VMADD_V : VMAC_MV_V_X<"vmadd", 0b101001>; defm VNMSUB_V : VMAC_MV_V_X<"vnmsub", 0b101011>; // Vector Widening Integer Multiply-Add Instructions +let DestEEW = EEWSEWx2 in { defm VWMACCU_V : VWMAC_MV_V_X<"vwmaccu", 0b111100>; defm VWMACC_V : VWMAC_MV_V_X<"vwmacc", 0b111101>; defm VWMACCSU_V : VWMAC_MV_V_X<"vwmaccsu", 0b111111>; defm VWMACCUS_V : VWMAC_MV_X<"vwmaccus", 0b111110>; +} // DestEEW = EEWSEWx2 // Vector Integer Merge Instructions defm VMERGE_V : VMRG_IV_V_X_I<"vmerge", 0b010111>; @@ -1342,7 +1346,8 @@ defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>; // Vector Widening Floating-Point Add/Subtract Instructions let Constraints = "@earlyclobber $vd", Uses = [FRM], - mayRaiseFPException = true in { + mayRaiseFPException = true, + DestEEW = EEWSEWx2 in { let RVVConstraint = WidenV in { defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000, "v">; defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010, "v">; @@ -1355,7 +1360,7 @@ let RVVConstraint = WidenW in { defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">; defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">; } // RVVConstraint = WidenW -} // Constraints = "@earlyclobber $vd", Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Single-Width Floating-Point Multiply/Divide Instructions let Uses = [FRM], mayRaiseFPException = true in { @@ -1366,9 +1371,9 @@ defm VFRDIV_V : VDIV_FV_F<"vfrdiv", 0b100001>; // Vector Widening Floating-Point Multiply let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, - Uses = [FRM], mayRaiseFPException = true in { + Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 in { defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Single-Width Floating-Point Fused Multiply-Add Instructions let Uses = [FRM], mayRaiseFPException = true in { @@ -1383,12 +1388,12 @@ defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>; } // Vector Widening Floating-Point Fused Multiply-Add Instructions -let Uses = [FRM], mayRaiseFPException = true in { +let Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 in { defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>; defm VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 
0b111101>; defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>; defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, DestEEW = EEWSEWx2 // Vector Floating-Point Square-Root Instruction let Uses = [FRM], mayRaiseFPException = true in { @@ -1420,14 +1425,14 @@ def : InstAlias<"vfabs.v $vd, $vs", (VFSGNJX_VV VR:$vd, VR:$vs, VR:$vs, zero_reg)>; // Vector Floating-Point Compare Instructions -let RVVConstraint = NoConstraint, mayRaiseFPException = true in { +let RVVConstraint = NoConstraint, mayRaiseFPException = true, DestEEW = EEW1 in { defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>; defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>; defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>; defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>; defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>; defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>; -} // RVVConstraint = NoConstraint, mayRaiseFPException = true +} // RVVConstraint = NoConstraint, mayRaiseFPException = true, DestEEW = EEW1 def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm", (VMFLT_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; @@ -1471,7 +1476,7 @@ defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; // Widening Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt, - mayRaiseFPException = true in { + mayRaiseFPException = true, DestEEW = EEWSEWx2 in { let Uses = [FRM] in { defm VFWCVT_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>; defm VFWCVT_X_F_V : VWCVTI_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>; @@ -1481,7 +1486,7 @@ defm VFWCVT_RTZ_X_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>; defm VFWCVT_F_XU_V : VWCVTF_IV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>; defm VFWCVT_F_X_V : VWCVTF_IV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>; defm VFWCVT_F_F_V : VWCVTF_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt +} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt, DestEEW = EEWSEWx2 // Narrowing Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", mayRaiseFPException = true in { @@ -1515,14 +1520,14 @@ defm VREDXOR : VRED_MV_V<"vredxor", 0b000011>; } // RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask // Vector Widening Integer Reduction Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 in { // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. 
defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>; defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 } // Predicates = [HasVInstructions] @@ -1543,7 +1548,7 @@ def : InstAlias<"vfredsum.vs $vd, $vs2, $vs1$vm", (VFREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>; // Vector Widening Floating-Point Reduction Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 in { // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to @@ -1552,7 +1557,7 @@ let Uses = [FRM], mayRaiseFPException = true in { defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>; defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>; } -} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint, ElementsDependOn = EltDepsVLMask, DestEEW = EEWSEWx2 def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", (VFWREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>; @@ -1560,7 +1565,7 @@ def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", let Predicates = [HasVInstructions] in { // Vector Mask-Register Logical Instructions -let RVVConstraint = NoConstraint in { +let RVVConstraint = NoConstraint, DestEEW = EEW1 in { defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">; defm VMNAND_M : VMALU_MV_Mask<"vmnand", 0b011101, "m">; defm VMANDN_M : VMALU_MV_Mask<"vmandn", 0b011000, "m">; @@ -1607,12 +1612,14 @@ def : InstAlias<"vpopc.m $vd, $vs2$vm", let Constraints = "@earlyclobber $vd", RVVConstraint = Iota, ElementsDependOn = EltDepsVLMask in { +let DestEEW = EEW1 in { // vmsbf.m set-before-first mask bit defm VMSBF_M : VMSFS_MV_V<"vmsbf.m", 0b010100, 0b00001>; // vmsif.m set-including-first mask bit defm VMSIF_M : VMSFS_MV_V<"vmsif.m", 0b010100, 0b00011>; // vmsof.m set-only-first mask bit defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>; +} // DestEEW = EEW1 // Vector Iota Instruction defm VIOTA_M : VIOTA_MV_V<"viota.m", 0b010100, 0b10000>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 3c1fb38349d5c..851e817c50125 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -201,21 +201,24 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } -let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod" in { +let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvqmaccdod", + DestEEW = EEWSEWx4 in { def VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; def VQMACC_2x8x2 : CustomSiFiveVMACC<0b101101, OPMVV, "sf.vqmacc.2x8x2">; def VQMACCUS_2x8x2 : CustomSiFiveVMACC<0b101110, OPMVV, "sf.vqmaccus.2x8x2">; def VQMACCSU_2x8x2 : CustomSiFiveVMACC<0b101111, OPMVV, "sf.vqmaccsu.2x8x2">; } -let Predicates = [HasVendorXSfvqmaccqoq], DecoderNamespace = "XSfvqmaccqoq" in { +let Predicates = [HasVendorXSfvqmaccqoq], 
DecoderNamespace = "XSfvqmaccqoq", + DestEEW = EEWSEWx4 in { def VQMACCU_4x8x4 : CustomSiFiveVMACC<0b111100, OPMVV, "sf.vqmaccu.4x8x4">; def VQMACC_4x8x4 : CustomSiFiveVMACC<0b111101, OPMVV, "sf.vqmacc.4x8x4">; def VQMACCUS_4x8x4 : CustomSiFiveVMACC<0b111110, OPMVV, "sf.vqmaccus.4x8x4">; def VQMACCSU_4x8x4 : CustomSiFiveVMACC<0b111111, OPMVV, "sf.vqmaccsu.4x8x4">; } -let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq" in { +let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq", + DestEEW = EEWSEWx2 in { def VFWMACC_4x4x4 : CustomSiFiveVMACC<0b111100, OPFVV, "sf.vfwmacc.4x4x4">; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 1b1f3b9b16e44..a79f757753325 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -19,7 +19,7 @@ let Predicates = [HasStdExtZvfbfmin], Constraints = "@earlyclobber $vd", mayRaiseFPException = true in { -let RVVConstraint = WidenCvt in +let RVVConstraint = WidenCvt, DestEEW = EEWSEWx2 in defm VFWCVTBF16_F_F_V : VWCVTF_FV_VS2<"vfwcvtbf16.f.f.v", 0b010010, 0b01101>; let Uses = [FRM] in defm VFNCVTBF16_F_F_W : VNCVTF_FV_VS2<"vfncvtbf16.f.f.w", 0b010010, 0b11101>; @@ -27,6 +27,7 @@ defm VFNCVTBF16_F_F_W : VNCVTF_FV_VS2<"vfncvtbf16.f.f.w", 0b010010, 0b11101>; let Predicates = [HasStdExtZvfbfwma], Constraints = "@earlyclobber $vd_wb, $vd = $vd_wb", - RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true in { + RVVConstraint = WidenV, Uses = [FRM], mayRaiseFPException = true, + DestEEW = EEWSEWx2 in { defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index e19a11805c9c0..7ec13e4eaafa7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -123,7 +123,8 @@ let Predicates = [HasStdExtZvbb] in { def VCLZ_V : VALUVs2<0b010010, 0b01100, OPMVV, "vclz.v">; def VCPOP_V : VALUVs2<0b010010, 0b01110, OPMVV, "vcpop.v">; def VCTZ_V : VALUVs2<0b010010, 0b01101, OPMVV, "vctz.v">; - let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in + let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV, + DestEEW = EEWSEWx2 in defm VWSLL_V : VSHT_IV_V_X_I<"vwsll", 0b110101>; } // Predicates = [HasStdExtZvbb] diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 026e0a365b8dc..a612a03106f02 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -69,6 +69,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); + bool hasSameEEW(const MachineInstr &User, const MachineInstr &Src) const; bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; @@ -98,10 +99,17 @@ static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { return LHS.getImm() <= RHS.getImm(); } -static unsigned getSEWLMULRatio(const MachineInstr &MI) { - RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags); - unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); - return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL); +/// Given \p User that has an input operand with EEW=SEW, which uses the 
dest +/// operand of \p Src with an unknown EEW, return true if their EEWs match. +bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, + const MachineInstr &Src) const { + unsigned UserLog2SEW = + User.getOperand(RISCVII::getSEWOpNum(User.getDesc())).getImm(); + unsigned SrcLog2SEW = + Src.getOperand(RISCVII::getSEWOpNum(Src.getDesc())).getImm(); + unsigned SrcLog2EEW = RISCV::getDestLog2EEW( + TII->get(RISCV::getRVVMCOpcode(Src.getOpcode())), SrcLog2SEW); + return SrcLog2EEW == UserLog2SEW; } // Attempt to reduce the VL of an instruction whose sole use is feeding a @@ -154,8 +162,8 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { !RISCVII::hasSEWOp(Src->getDesc().TSFlags)) return false; - // Src needs to have the same VLMAX as MI - if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src)) + // Src's dest needs to have the same EEW as MI's input. + if (!hasSameEEW(MI, *Src)) return false; bool ElementsDependOnVL = RISCVII::elementsDependOnVL( @@ -486,8 +494,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { if (Src && !Src->hasUnmodeledSideEffects() && MRI->hasOneUse(MI.getOperand(2).getReg()) && RISCVII::hasVLOp(Src->getDesc().TSFlags) && - RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && - getSEWLMULRatio(MI) == getSEWLMULRatio(*Src)) { + RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags) && hasSameEEW(MI, *Src)) { const MachineOperand &MIVL = MI.getOperand(3); const MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); @@ -532,8 +539,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) return false; - // Src needs to have the same VLMAX as MI - if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src)) + // Src's dest needs to have the same EEW as MI's input. + if (!hasSameEEW(MI, *Src)) return false; // Src needs to have the same passthru as VMV_V_V diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir index 6858231bf0e6c..2f02be025485c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -73,3 +73,19 @@ body: | %passthru:vr = COPY $v8 %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ %y:vr = PseudoVMV_V_V_M1 $noreg, %x, 4, 5 /* e32 */, 1 /* ta, mu */ +... 
+--- +# Shouldn't be folded because the EEWs don't match +name: different_eew +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: different_eew + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_MF4 %passthru, $noreg, $noreg, 4, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVMV_V_V_MF8 %passthru, %x, 4, 3 /* e8 */, 0 /* tu, mu */ + %passthru:vr = COPY $v8 + %x:vr = PseudoVADD_VV_MF4 %passthru, $noreg, $noreg, 4, 4 /* e16 */, 0 /* tu, mu */ + %y:vr = PseudoVMV_V_V_MF8 %passthru, %x, 4, 3 /* e8 */, 0 /* tu, mu */ diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp index fe711619c6320..cc0aca10fc6da 100644 --- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp +++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp @@ -316,6 +316,27 @@ TEST_P(RISCVInstrInfoTest, DescribeLoadedValue) { MF->deleteMachineBasicBlock(MBB); } +TEST_P(RISCVInstrInfoTest, GetDestEEW) { + const RISCVInstrInfo *TII = ST->getInstrInfo(); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VADD_VV), 3), 3u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWADD_VV), 3), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VLE32_V), 5), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VLSE32_V), 5), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VREDSUM_VS), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWREDSUM_VS), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWREDOSUM_VS), 5), 6u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFCVT_RTZ_XU_F_V), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWCVT_RTZ_XU_F_V), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VSLL_VI), 4), 4u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VWSLL_VI), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VMSEQ_VV), 4), 0u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VMAND_MM), 0), 0u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VIOTA_M), 3), 3u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VQMACCU_2x8x2), 3), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::VFWMACC_4x4x4), 4), 5u); + EXPECT_EQ(RISCV::getDestLog2EEW(TII->get(RISCV::THVdotVMAQA_VV), 5), 5u); +} + } // namespace INSTANTIATE_TEST_SUITE_P(RV32And64, RISCVInstrInfoTest, From f006246299c96486a8e37005a94e07c0bf334ee0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 09:34:39 +0200 Subject: [PATCH 193/425] [CodeGen] Add generic INIT_UNDEF pseudo (#106744) The InitUndef pass currently uses target-specific pseudo instructions, with one pseudo per register class. Instead, add a generic pseudo instruction, which can be used by all targets and register classes. 
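
A minimal sketch of the resulting change in InitUndef.cpp (the variable names match the pass; the exact call sites are in the diff below):

    // Before: every target had to map each register class to its own pseudo.
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(TII->getUndefInitOpcode(TargetRegClass->getID())), NewReg);

    // After: a single generic opcode covers all targets and register classes.
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(TargetOpcode::INIT_UNDEF), NewReg);

The generic pseudo is expanded to nothing by the AsmPrinter, so targets no longer need per-class pseudo definitions or a getUndefInitOpcode hook.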
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 9 - llvm/include/llvm/Support/TargetOpcodes.def | 5 + llvm/include/llvm/Target/Target.td | 7 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 4 + llvm/lib/CodeGen/InitUndef.cpp | 7 +- llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 6 - llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 13 -- llvm/lib/Target/ARM/ARMInstrInfo.td | 12 -- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 5 - llvm/lib/Target/RISCV/RISCVInstrInfo.h | 15 -- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 9 - .../rvv/handle-noreg-with-implicit-def.mir | 4 +- .../rvv/subregister-undef-early-clobber.mir | 176 +++++++++--------- .../RISCV/rvv/undef-earlyclobber-chain.mir | 4 +- .../Thumb2/mve-laneinterleaving-cost.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 10 +- .../match-table-imms.td | 2 +- 17 files changed, 119 insertions(+), 173 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 49ce13dd8cbe3..65c5788ac5cc9 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2278,15 +2278,6 @@ class TargetInstrInfo : public MCInstrInfo { llvm_unreachable("unknown number of operands necessary"); } - /// Gets the opcode for the Pseudo Instruction used to initialize - /// the undef value. If no Instruction is available, this will - /// fail compilation. - virtual unsigned getUndefInitOpcode(unsigned RegClassID) const { - (void)RegClassID; - - llvm_unreachable("Unexpected register class."); - } - private: mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 635c265a43363..e1883de0c93b4 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -56,6 +56,11 @@ HANDLE_TARGET_OPCODE(INSERT_SUBREG) /// IMPLICIT_DEF - This is the MachineInstr-level equivalent of undef. HANDLE_TARGET_OPCODE(IMPLICIT_DEF) +/// Explicit undef initialization used past IMPLICIT_DEF elimination in cases +/// where an undef operand must be allocated to a different register than an +/// early-clobber result operand. +HANDLE_TARGET_OPCODE(INIT_UNDEF) + /// SUBREG_TO_REG - Assert the value of bits in a super register. /// The result of this instruction is the value of the second operand inserted /// into the subregister specified by the third operand. 
All other bits are diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index b2eb250ae60b6..3e037affe1cfd 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1254,6 +1254,13 @@ def IMPLICIT_DEF : StandardPseudoInstruction { let isAsCheapAsAMove = true; let isMeta = true; } +def INIT_UNDEF : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins); + let AsmString = ""; + let hasSideEffects = false; + let Size = 0; +} def SUBREG_TO_REG : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); let InOperandList = (ins unknown:$implsrc, unknown:$subsrc, i32imm:$subidx); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 19d23c8ba9678..88e9b9d27d3f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1832,6 +1832,10 @@ void AsmPrinter::emitFunctionBody() { // This instruction is only used to note jump table debug info, it's // purely meta information. break; + case TargetOpcode::INIT_UNDEF: + // This is only used to influence register allocation behavior, no + // actual initialization is needed. + break; default: emitInstruction(&MI); if (CanDoExtraAnalysis) { diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 7c1b90afd495e..8d20f2668de6b 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -177,8 +177,7 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); LLVM_DEBUG(dbgs() << "Register Class ID" << SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), - TII->get(TII->getUndefInitOpcode(SubRegClass->getID())), - TmpInitSubReg); + TII->get(TargetOpcode::INIT_UNDEF), TmpInitSubReg); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(TargetOpcode::INSERT_SUBREG), NewReg) @@ -203,9 +202,9 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { const TargetRegisterClass *TargetRegClass = TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); - unsigned Opcode = TII->getUndefInitOpcode(TargetRegClass->getID()); Register NewReg = MRI->createVirtualRegister(TargetRegClass); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(Opcode), NewReg); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::INIT_UNDEF), NewReg); MO.setReg(NewReg); if (MO.isUndef()) MO.setIsUndef(false); diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 8eb5d91d3b879..710182985a1e9 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -2411,12 +2411,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { case ARM::SEH_EpilogEnd: ATS.emitARMWinCFIEpilogEnd(); return; - - case ARM::PseudoARMInitUndefMQPR: - case ARM::PseudoARMInitUndefSPR: - case ARM::PseudoARMInitUndefDPR_VFP2: - case ARM::PseudoARMInitUndefGPR: - return; } MCInst TmpInst; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 27290f7f76347..aee9797585dbd 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -546,19 +546,6 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { 
std::optional isAddImmediate(const MachineInstr &MI, Register Reg) const override; - - unsigned getUndefInitOpcode(unsigned RegClassID) const override { - if (RegClassID == ARM::MQPRRegClass.getID()) - return ARM::PseudoARMInitUndefMQPR; - if (RegClassID == ARM::SPRRegClass.getID()) - return ARM::PseudoARMInitUndefSPR; - if (RegClassID == ARM::DPR_VFP2RegClass.getID()) - return ARM::PseudoARMInitUndefDPR_VFP2; - if (RegClassID == ARM::GPRRegClass.getID()) - return ARM::PseudoARMInitUndefGPR; - - llvm_unreachable("Unexpected register class."); - } }; /// Get the operands corresponding to the given \p Pred value. By default, the diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 0fc561382084e..ed68c6ff20cde 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6549,15 +6549,3 @@ let isPseudo = 1 in { let isTerminator = 1 in def SEH_EpilogEnd : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; } - - -//===----------------------------------------------------------------------===// -// Pseudo Instructions for use when early-clobber is defined and Greedy Register -// Allocation is used. This ensures the constraint is used properly. -//===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in { - def PseudoARMInitUndefMQPR : PseudoInst<(outs MQPR:$vd), (ins), NoItinerary, []>; - def PseudoARMInitUndefSPR : PseudoInst<(outs SPR:$sd), (ins), NoItinerary, []>; - def PseudoARMInitUndefDPR_VFP2 : PseudoInst<(outs DPR_VFP2:$dd), (ins), NoItinerary, []>; - def PseudoARMInitUndefGPR : PseudoInst<(outs GPR:$rd), (ins), NoItinerary, []>; -} diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 476dde2be39e5..24bca2da652d0 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -303,11 +303,6 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { case RISCV::KCFI_CHECK: LowerKCFI_CHECK(*MI); return; - case RISCV::PseudoRVVInitUndefM1: - case RISCV::PseudoRVVInitUndefM2: - case RISCV::PseudoRVVInitUndefM4: - case RISCV::PseudoRVVInitUndefM8: - return; case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); case TargetOpcode::PATCHPOINT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index e040891539ddf..457db9b9860d0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -293,21 +293,6 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override; - unsigned getUndefInitOpcode(unsigned RegClassID) const override { - switch (RegClassID) { - case RISCV::VRRegClassID: - return RISCV::PseudoRVVInitUndefM1; - case RISCV::VRM2RegClassID: - return RISCV::PseudoRVVInitUndefM2; - case RISCV::VRM4RegClassID: - return RISCV::PseudoRVVInitUndefM4; - case RISCV::VRM8RegClassID: - return RISCV::PseudoRVVInitUndefM8; - default: - llvm_unreachable("Unexpected register class."); - } - } - protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 1b4303fbbcf80..c91c9c3614a34 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6116,15 +6116,6 @@ foreach lmul = MxList in { } } -/// Empty pseudo for RISCVInitUndefPass -let 
hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 0, - isCodeGenOnly = 1 in { - def PseudoRVVInitUndefM1 : Pseudo<(outs VR:$vd), (ins), [], "">; - def PseudoRVVInitUndefM2 : Pseudo<(outs VRM2:$vd), (ins), [], "">; - def PseudoRVVInitUndefM4 : Pseudo<(outs VRM4:$vd), (ins), [], "">; - def PseudoRVVInitUndefM8 : Pseudo<(outs VRM8:$vd), (ins), [], "">; -} - //===----------------------------------------------------------------------===// // 6. Configuration-Setting Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir index e090b313d4f7b..7b4d200ef8a3b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir +++ b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir @@ -9,8 +9,8 @@ body: | ; MIR-LABEL: name: vrgather_all_undef ; MIR: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF ; MIR-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF - ; MIR-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; MIR-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 [[DEF1]], killed [[PseudoRVVInitUndefM1_]], 0, 0, 5 /* e32 */, 0 /* tu, mu */ + ; MIR-NEXT: [[INIT_UNDEF:%[0-9]+]]:vr = INIT_UNDEF + ; MIR-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 [[DEF1]], killed [[INIT_UNDEF]], 0, 0, 5 /* e32 */, 0 /* tu, mu */ ; MIR-NEXT: $v8 = COPY %1 ; MIR-NEXT: PseudoRET implicit $v8 %2:vr = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index 539d319f3426d..be6ed4d2a6aa1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -14,10 +14,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -52,10 +52,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = 
PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_0 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -90,10 +90,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -128,10 +128,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* 
tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -166,8 +166,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -202,8 +202,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M4 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -239,12 +239,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG 
[[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -279,12 +279,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -319,12 +319,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], 
%subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -359,12 +359,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -399,12 +399,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_4 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_5 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_5 ; CHECK-NEXT: early-clobber 
%6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -439,12 +439,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_5 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_4 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_4 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -479,12 +479,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_6 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_7 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_7 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed 
[[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -519,12 +519,12 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_7 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[PseudoRVVInitUndefM1_]], %subreg.sub_vrm1_6 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_6 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -559,10 +559,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -597,10 +597,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* 
e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -635,10 +635,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_3 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -673,10 +673,10 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[PseudoRVVInitUndefM2_:%[0-9]+]]:vrm2 = PseudoRVVInitUndefM2 - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[PseudoRVVInitUndefM2_]], %subreg.sub_vrm2_2 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG 
[[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -711,8 +711,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_1 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype @@ -747,8 +747,8 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[PseudoRVVInitUndefM4_:%[0-9]+]]:vrm4 = PseudoRVVInitUndefM4 - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[PseudoRVVInitUndefM4_]], %subreg.sub_vrm4_0 + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 ; CHECK-NEXT: PseudoVSE32_V_M8 killed %6, killed [[ADDI1]], 0, 5 /* e32 */, implicit $vl, implicit $vtype diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir index 8df2c60c926c3..69078710e9ccf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir @@ -78,8 +78,8 @@ body: | ; CHECK-LABEL: name: undef_early_clobber_chain ; CHECK: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: [[PseudoRVVInitUndefM1_:%[0-9]+]]:vr = PseudoRVVInitUndefM1 - ; CHECK-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 undef [[DEF]], [[PseudoRVVInitUndefM1_]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: early-clobber %1:vr = PseudoVRGATHER_VI_M1 undef [[DEF]], [[INIT_UNDEF]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v8 = COPY %1 ; CHECK-NEXT: PseudoRET implicit $v8 %2:vr = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index c2511a4992cf5..e86c368e0fe8a 
100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -365,13 +365,13 @@ define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) { ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: smull r12, r3, r1, r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.f32 s0, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 6d581afe9fb31..8eb941371f993 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -229,9 +229,9 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 ; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov r4, r7, d12 @@ -780,8 +780,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmullb.u32 q5, q4, q3 ; CHECK-NEXT: vmov.f32 s6, s5 @@ -792,6 +792,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo +; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: bfi r6, r5, #0, #8 ; CHECK-NEXT: vmov r4, r5, d11 ; CHECK-NEXT: lsrl r4, r5, #31 @@ -800,12 +801,11 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #8, #8 -; CHECK-NEXT: vmsr p0, r6 -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: vmov r10, r5, d8 ; CHECK-NEXT: lsrl r10, r5, #31 +; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: subs.w r6, r10, #-1 +; CHECK-NEXT: vpsel q3, q3, q0 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index d9a8854cd018f..e0b802447ea2a 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,7 +34,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(20), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), // CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4([[L493:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4({{[0-9]+}}), From f1ac334b13c22222ed5c71bad04ed8345b2be135 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 08:41:39 +0100 Subject: [PATCH 194/425] [Clang][SemaCXX] Preserve qualifiers in derived-to-base cast in defaulted comparison operators (#102619) Fixes #102588 Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaDeclCXX.cpp | 10 ++-- clang/test/SemaCXX/cxx20-default-compare.cpp | 50 ++++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1520f7a2916aa..44ffad94ef41f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -355,6 +355,8 @@ Bug Fixes to C++ Support - Fix an issue with dependent source location expressions (#GH106428), (#GH81155), (#GH80210), (#GH85373) - Fixed a bug in the substitution of empty pack indexing types. (#GH105903) - Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) +- Fixed a bug where defaulted comparison operators would remove ``const`` from base classes. 
(#GH102588) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 3044f1218f5b2..f90f16c2923d0 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -8450,10 +8450,12 @@ class DefaultedComparisonSynthesizer if (Obj.first.isInvalid() || Obj.second.isInvalid()) return {ExprError(), ExprError()}; CXXCastPath Path = {Base}; - return {S.ImpCastExprToType(Obj.first.get(), Base->getType(), - CK_DerivedToBase, VK_LValue, &Path), - S.ImpCastExprToType(Obj.second.get(), Base->getType(), - CK_DerivedToBase, VK_LValue, &Path)}; + const auto CastToBase = [&](Expr *E) { + QualType ToType = S.Context.getQualifiedType( + Base->getType(), E->getType().getQualifiers()); + return S.ImpCastExprToType(E, ToType, CK_DerivedToBase, VK_LValue, &Path); + }; + return {CastToBase(Obj.first.get()), CastToBase(Obj.second.get())}; } ExprPair getField(FieldDecl *Field) { diff --git a/clang/test/SemaCXX/cxx20-default-compare.cpp b/clang/test/SemaCXX/cxx20-default-compare.cpp index 7074ee885ac4a..3e4673c31e489 100644 --- a/clang/test/SemaCXX/cxx20-default-compare.cpp +++ b/clang/test/SemaCXX/cxx20-default-compare.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 %s -std=c++23 -verify -Wfloat-equal +#include "Inputs/std-compare.h" + struct Foo { float val; bool operator==(const Foo &) const; @@ -15,3 +17,51 @@ bool operator==(const Foo &, const Foo &) = default; // expected-warning {{comp // Declare the defaulted comparison function as a non-member function. Arguments are passed by value. bool operator==(Foo, Foo) = default; // expected-warning {{comparing floating point with == or != is unsafe}} expected-note {{in defaulted equality comparison operator for 'Foo' first required here}} + +namespace GH102588 { +struct A { + int i = 0; + constexpr operator int() const { return i; } + constexpr operator int&() { return ++i; } +}; + +struct B : A { + bool operator==(const B &) const = default; +}; + +constexpr bool f() { + B x; + return x == x; +} + +static_assert(f()); + +struct ConstOnly { + std::strong_ordering operator<=>(const ConstOnly&) const; + std::strong_ordering operator<=>(ConstOnly&) = delete; + friend bool operator==(const ConstOnly&, const ConstOnly&); + friend bool operator==(ConstOnly&, ConstOnly&) = delete; +}; + +struct MutOnly { + std::strong_ordering operator<=>(const MutOnly&) const = delete;; + std::strong_ordering operator<=>(MutOnly&); + friend bool operator==(const MutOnly&, const MutOnly&) = delete;; + friend bool operator==(MutOnly&, MutOnly&); +}; + +struct ConstCheck : ConstOnly { + friend std::strong_ordering operator<=>(const ConstCheck&, const ConstCheck&) = default; + std::strong_ordering operator<=>(ConstCheck const& __restrict) const __restrict = default; + friend bool operator==(const ConstCheck&, const ConstCheck&) = default; + bool operator==(this const ConstCheck&, const ConstCheck&) = default; +}; + +// FIXME: Non-reference explicit object parameter are rejected +struct MutCheck : MutOnly { + friend bool operator==(MutCheck, MutCheck) = default; + // std::strong_ordering operator<=>(this MutCheck, MutCheck) = default; + friend std::strong_ordering operator<=>(MutCheck, MutCheck) = default; + // bool operator==(this MutCheck, MutCheck) = default; +}; +} From cf1ad28169be5d026ec95f351b56b0c090b3e682 Mon Sep 17 00:00:00 2001 From: Daniel Grumberg Date: Thu, 5 Sep 2024 09:15:09 +0100 Subject: [PATCH 195/425] [clang][ExtractAPI] Handle AttributedType fragments transparently (#107262) 
rdar://131958623 --- clang/lib/ExtractAPI/DeclarationFragments.cpp | 13 ++++++++++ clang/test/ExtractAPI/attributed-typedef.m | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 clang/test/ExtractAPI/attributed-typedef.m diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index d77bb1d424f7c..06ce5ed6a6475 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -276,6 +276,19 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( DeclarationFragments Fragments; + if (const MacroQualifiedType *MQT = dyn_cast(T)) { + Fragments.append( + getFragmentsForType(MQT->getUnderlyingType(), Context, After)); + return Fragments; + } + + if (const AttributedType *AT = dyn_cast(T)) { + // FIXME: Serialize Attributes correctly + Fragments.append( + getFragmentsForType(AT->getModifiedType(), Context, After)); + return Fragments; + } + // An ElaboratedType is a sugar for types that are referred to using an // elaborated keyword, e.g., `struct S`, `enum E`, or (in C++) via a // qualified name, e.g., `N::M::type`, or both. diff --git a/clang/test/ExtractAPI/attributed-typedef.m b/clang/test/ExtractAPI/attributed-typedef.m new file mode 100644 index 0000000000000..c948c873ab759 --- /dev/null +++ b/clang/test/ExtractAPI/attributed-typedef.m @@ -0,0 +1,24 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: -triple arm64-apple-macosx -x objective-c-header %s -o %t/output.symbols.json + +_Pragma("clang assume_nonnull begin") + +struct Foo { int a; }; +typedef struct Foo *Bar; +// RUN: FileCheck %s -input-file %t/output.symbols.json --check-prefix FUNC +void func(Bar b); +// FUNC-LABEL: "!testLabel": "c:@F@func", +// CHECK-NOT: Foo +// CHECK: "pathComponents" + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix THING +#define SWIFT_NAME(_name) __attribute__((swift_name(#_name))) +extern Bar const thing SWIFT_NAME(swiftThing); +// THING-LABEL: "!testLabel": "c:@thing" +// THING-NOT: Foo +// THING: "pathComponents" + +_Pragma("clang assume_nonnull end") + +// expected-no-diagnostics From 41373098421f2aa551a0879537864c87d797a102 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 09:15:54 +0100 Subject: [PATCH 196/425] [Clang] Warn with -Wpre-c23-compat instead of -Wpre-c++17-compat for u8 character literals in C23 (#97210) Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Basic/DiagnosticLexKinds.td | 3 +++ clang/lib/Lex/Lexer.cpp | 4 +++- clang/test/Sema/pre-c2x-compat.c | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 44ffad94ef41f..bd84a2e40fb8b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -278,6 +278,8 @@ Improvements to Clang's diagnostics - The lifetimebound and GSL analysis in clang are coherent, allowing clang to detect more use-after-free bugs. (#GH100549). +- Clang now warns for u8 character literals used in C23 with ``-Wpre-c23-compat`` instead of ``-Wpre-c++17-compat``. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 12d7b8c0205ee..fc14bb6aa2165 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -283,6 +283,9 @@ def warn_cxx98_compat_unicode_literal : Warning< def warn_cxx14_compat_u8_character_literal : Warning< "unicode literals are incompatible with C++ standards before C++17">, InGroup, DefaultIgnore; +def warn_c17_compat_u8_character_literal : Warning< + "unicode literals are incompatible with C standards before C23">, + InGroup, DefaultIgnore; def warn_cxx11_compat_user_defined_literal : Warning< "identifier after literal will be treated as a user-defined literal suffix " "in C++11">, InGroup, DefaultIgnore; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index ef1e1f4bd9aeb..8647e9f2f27c3 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2428,7 +2428,9 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, ? diag::warn_cxx98_compat_unicode_literal : diag::warn_c99_compat_unicode_literal); else if (Kind == tok::utf8_char_constant) - Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); + Diag(BufferPtr, LangOpts.CPlusPlus + ? diag::warn_cxx14_compat_u8_character_literal + : diag::warn_c17_compat_u8_character_literal); } char C = getAndAdvanceChar(CurPtr, Result); diff --git a/clang/test/Sema/pre-c2x-compat.c b/clang/test/Sema/pre-c2x-compat.c index fad472f1f72d5..15bb9b58349fa 100644 --- a/clang/test/Sema/pre-c2x-compat.c +++ b/clang/test/Sema/pre-c2x-compat.c @@ -1,3 +1,4 @@ // RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -pedantic -fsyntax-only -verify int digit_seps = 123'456; // expected-warning {{digit separators are incompatible with C standards before C23}} +unsigned char u8_char = u8'x'; // expected-warning {{unicode literals are incompatible with C standards before C23}} From 3e4788377bb29ed389b46521fcba0d06aa985bcf Mon Sep 17 00:00:00 2001 From: Giulio Eulisse <10544+ktf@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:16:51 +0200 Subject: [PATCH 197/425] Recover performance loss after PagedVector introduction (#67972) --- clang/include/clang/Basic/SourceManager.h | 2 +- llvm/include/llvm/ADT/PagedVector.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index d3ccc7ef81c07..e0f1ea435d54e 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -724,7 +724,7 @@ class SourceManager : public RefCountedBase { /// /// Negative FileIDs are indexes into this table. To get from ID to an index, /// use (-ID - 2). - llvm::PagedVector LoadedSLocEntryTable; + llvm::PagedVector LoadedSLocEntryTable; /// For each allocation in LoadedSLocEntryTable, we keep the first FileID. /// We assume exactly one allocation per AST file, and use that to determine diff --git a/llvm/include/llvm/ADT/PagedVector.h b/llvm/include/llvm/ADT/PagedVector.h index 3fcca6d82cb33..52ecd0bb0ba11 100644 --- a/llvm/include/llvm/ADT/PagedVector.h +++ b/llvm/include/llvm/ADT/PagedVector.h @@ -84,7 +84,7 @@ template class PagedVector { assert(Index / PageSize < PageToDataPtrs.size()); T *&PagePtr = PageToDataPtrs[Index / PageSize]; // If the page was not yet allocated, allocate it. 
- if (!PagePtr) { + if (LLVM_UNLIKELY(!PagePtr)) { PagePtr = Allocator.getPointer()->template Allocate(PageSize); // We need to invoke the default constructor on all the elements of the // page. From b206bf0952796cb93f1aca9e47d5764e474e1998 Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Thu, 5 Sep 2024 10:41:18 +0200 Subject: [PATCH 198/425] Fix CLANG_BOOTSTRAP_TARGETS in Release.cmake (#106407) # Problem Before this patch you could not build the `stage2-LLVM` for example because you first had to manually add it to `CLANG_BOOTSTRAP_TARGETS` in the `Release.cmake` and also add it to `LLVM_RELEASE_FINAL_STAGE_TARGETS` in the cmake configure run. Now you can just use `-DLLVM_RELEASE_FINAL_STAGE_TARGETS="LLVM;clang"` on the cmake CLI and be able to build the targets `stage2-LLVM` and `stage2-clang` without further changes to the cache file. # Solution Take all `LLVM_RELEASE_FINAL_STAGE_TARGETS` elements and append them prefixed with `stage2-` to `CLANG_BOOTSTRAP_TARGETS`. Afterwards all duplicates are removed. --- clang/cmake/caches/Release.cmake | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index 6d5f75ca0074e..c93ff40ff3ee4 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -55,14 +55,22 @@ set(STAGE1_RUNTIMES "compiler-rt") if (LLVM_RELEASE_ENABLE_PGO) list(APPEND STAGE1_PROJECTS "lld") - set(CLANG_BOOTSTRAP_TARGETS + set(tmp_targets generate-profdata stage2-package stage2-clang + stage2 stage2-install stage2-check-all stage2-check-llvm - stage2-check-clang CACHE STRING "") + stage2-check-clang) + + foreach(X IN LISTS LLVM_RELEASE_FINAL_STAGE_TARGETS) + list(APPEND tmp_targets "stage2-${X}") + endforeach() + list(REMOVE_DUPLICATES tmp_targets) + + set(CLANG_BOOTSTRAP_TARGETS "${tmp_targets}" CACHE STRING "") # Configuration for stage2-instrumented set(BOOTSTRAP_CLANG_ENABLE_BOOTSTRAP ON CACHE STRING "") From 3413f957243e4a152726e572986eb730699b8486 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 5 Sep 2024 10:01:52 +0100 Subject: [PATCH 199/425] [AArch64] Add a few extra two-step zext shuffle tests. 
NFC --- llvm/test/CodeGen/AArch64/zext-shuffle.ll | 78 +++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index af5a92017bbbc..6415fba29ff79 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -263,6 +263,84 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { } +define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_0246: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_1357: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_04812: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_15913: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_261014: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: v8i32_371115: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> + %d = zext <8 x i8> %c to <8 x i32> + ret <8 x i32> %d +} + + define <8 x i64> @zext_add(<32 x i16> %l) { ; CHECK-LABEL: zext_add: ; CHECK: // %bb.0: From a95b212e9957b8f5b7d452b4713a7b6f9ee19e71 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 5 Sep 2024 10:34:34 +0100 Subject: [PATCH 200/425] [DWARF] Don't search scope chain to find DISubprogram for prologues (#107261) Seemingly this goes back to fd07a2a in 2015 -- I anticipate that back then the metadata layout was radically different. But nowadays at least, we can just directly look up the subprogram. 
--- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 77e304383fb5c..9da9382baf863 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2198,11 +2198,10 @@ DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, // Ensure the compile unit is created if the function is called before // beginFunction(). - (void)getOrCreateDwarfCompileUnit( - MF.getFunction().getSubprogram()->getUnit()); + DISubprogram *SP = MF.getFunction().getSubprogram(); + (void)getOrCreateDwarfCompileUnit(SP->getUnit()); // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - const DISubprogram *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); ::recordSourceLine(*Asm, SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT, CUID, getDwarfVersion(), getUnits()); return PrologEndLoc; From 03d5b7ca3d83eee3514318ef8934ba26bc3d7fa9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 10:14:34 +0200 Subject: [PATCH 201/425] [MemorySanitizer] Don't create types pointers (NFC) Everything in this pass uses a single addrspace 0 pointer type. Don't try to create it using the typed pointer ctor. This allows removing the type argument from getShadowPtrForVAArgument(). --- .../Instrumentation/MemorySanitizer.cpp | 129 +++++++----------- 1 file changed, 52 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 0b3d85afcacf1..17c5638cf8ced 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -769,8 +769,7 @@ MemorySanitizer::getOrInsertMsanMetadataFunction(Module &M, StringRef Name, ArgsTy... Args) { if (TargetTriple.getArch() == Triple::systemz) { // SystemZ ABI: shadow/origin pair is returned via a hidden parameter. 
- return M.getOrInsertFunction(Name, Type::getVoidTy(*C), - PointerType::get(MsanMetadata, 0), + return M.getOrInsertFunction(Name, Type::getVoidTy(*C), PtrTy, std::forward(Args)...); } @@ -804,29 +803,26 @@ void MemorySanitizer::createKernelApi(Module &M, const TargetLibraryInfo &TLI) { ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */ IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy, OriginTy); - MsanGetContextStateFn = M.getOrInsertFunction( - "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0)); + MsanGetContextStateFn = + M.getOrInsertFunction("__msan_get_context_state", PtrTy); - MsanMetadata = StructType::get(PointerType::get(IRB.getInt8Ty(), 0), - PointerType::get(IRB.getInt32Ty(), 0)); + MsanMetadata = StructType::get(PtrTy, PtrTy); for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) { std::string name_load = "__msan_metadata_ptr_for_load_" + std::to_string(size); std::string name_store = "__msan_metadata_ptr_for_store_" + std::to_string(size); - MsanMetadataPtrForLoad_1_8[ind] = getOrInsertMsanMetadataFunction( - M, name_load, PointerType::get(IRB.getInt8Ty(), 0)); - MsanMetadataPtrForStore_1_8[ind] = getOrInsertMsanMetadataFunction( - M, name_store, PointerType::get(IRB.getInt8Ty(), 0)); + MsanMetadataPtrForLoad_1_8[ind] = + getOrInsertMsanMetadataFunction(M, name_load, PtrTy); + MsanMetadataPtrForStore_1_8[ind] = + getOrInsertMsanMetadataFunction(M, name_store, PtrTy); } MsanMetadataPtrForLoadN = getOrInsertMsanMetadataFunction( - M, "__msan_metadata_ptr_for_load_n", PointerType::get(IRB.getInt8Ty(), 0), - IRB.getInt64Ty()); + M, "__msan_metadata_ptr_for_load_n", PtrTy, IRB.getInt64Ty()); MsanMetadataPtrForStoreN = getOrInsertMsanMetadataFunction( - M, "__msan_metadata_ptr_for_store_n", - PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty()); + M, "__msan_metadata_ptr_for_store_n", PtrTy, IRB.getInt64Ty()); // Functions for poisoning and unpoisoning memory. MsanPoisonAllocaFn = M.getOrInsertFunction( @@ -937,9 +933,8 @@ void MemorySanitizer::initializeCallbacks(Module &M, const TargetLibraryInfo &TL TLI.getAttrList(C, {1}, /*Signed=*/true), PtrTy, PtrTy, IRB.getInt32Ty(), IntptrTy); - MsanInstrumentAsmStoreFn = - M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(), - PointerType::get(IRB.getInt8Ty(), 0), IntptrTy); + MsanInstrumentAsmStoreFn = M.getOrInsertFunction( + "__msan_instrument_asm_store", IRB.getVoidTy(), PtrTy, IntptrTy); if (CompileKernel) { createKernelApi(M, TLI); @@ -1264,8 +1259,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); - Value *IntptrOriginPtr = - IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + Value *IntptrOriginPtr = IRB.CreatePointerCast(OriginPtr, MS.PtrTy); for (unsigned i = 0; i < Size / IntptrSize; ++i) { Value *Ptr = i ? 
IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) : IntptrOriginPtr; @@ -1691,7 +1685,7 @@ struct MemorySanitizerVisitor : public InstVisitor { VectTy->getElementCount()); } assert(IntPtrTy == MS.IntptrTy); - return PointerType::get(*MS.C, 0); + return MS.PtrTy; } Constant *constToIntPtr(Type *IntPtrTy, uint64_t C) const { @@ -1787,8 +1781,7 @@ struct MemorySanitizerVisitor : public InstVisitor { TypeSize Size = DL.getTypeStoreSize(ShadowTy); FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size); - Value *AddrCast = - IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0)); + Value *AddrCast = IRB.CreatePointerCast(Addr, MS.PtrTy); if (Getter) { ShadowOriginPtrs = createMetadataCall(IRB, Getter, AddrCast); } else { @@ -1799,7 +1792,7 @@ struct MemorySanitizerVisitor : public InstVisitor { AddrCast, SizeVal); } Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0); - ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0)); + ShadowPtr = IRB.CreatePointerCast(ShadowPtr, MS.PtrTy); Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1); return std::make_pair(ShadowPtr, OriginPtr); @@ -5009,21 +5002,19 @@ struct VarArgHelperBase : public VarArgHelper { } /// Compute the shadow address for a given va_arg. - Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, - unsigned ArgOffset) { + Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) { Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), - "_msarg_va_s"); + return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_s"); } /// Compute the shadow address for a given va_arg. - Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, - unsigned ArgOffset, unsigned ArgSize) { + Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset, + unsigned ArgSize) { // Make sure we don't overflow __msan_va_arg_tls. if (ArgOffset + ArgSize > kParamTLSSize) return nullptr; - return getShadowPtrForVAArgument(Ty, IRB, ArgOffset); + return getShadowPtrForVAArgument(IRB, ArgOffset); } /// Compute the origin address for a given va_arg. @@ -5033,8 +5024,7 @@ struct VarArgHelperBase : public VarArgHelper { // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never // overflow. 
Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), - "_msarg_va_o"); + return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_o"); } void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase, @@ -5147,8 +5137,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(RealTy); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - Value *ShadowBase = - getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset); + Value *ShadowBase = getShadowPtrForVAArgument(IRB, OverflowOffset); Value *OriginBase = nullptr; if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, OverflowOffset); @@ -5177,14 +5166,14 @@ struct VarArgAMD64Helper : public VarArgHelperBase { Value *ShadowBase, *OriginBase = nullptr; switch (AK) { case AK_GeneralPurpose: - ShadowBase = getShadowPtrForVAArgument(A->getType(), IRB, GpOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, GpOffset); if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, GpOffset); GpOffset += 8; assert(GpOffset <= kParamTLSSize); break; case AK_FloatingPoint: - ShadowBase = getShadowPtrForVAArgument(A->getType(), IRB, FpOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, FpOffset); if (MS.TrackOrigins) OriginBase = getOriginPtrForVAArgument(IRB, FpOffset); FpOffset += 16; @@ -5196,8 +5185,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - ShadowBase = - getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + ShadowBase = getShadowPtrForVAArgument(IRB, OverflowOffset); if (MS.TrackOrigins) { OriginBase = getOriginPtrForVAArgument(IRB, OverflowOffset); } @@ -5263,13 +5251,11 @@ struct VarArgAMD64Helper : public VarArgHelperBase { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, 16)), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(16); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5280,13 +5266,12 @@ struct VarArgAMD64Helper : public VarArgHelperBase { if (MS.TrackOrigins) IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy, Alignment, AMD64FpEndOffset); - Type *OverflowArgAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, 8)), - PointerType::get(OverflowArgAreaPtrTy, 0)); + MS.PtrTy); Value *OverflowArgAreaPtr = - IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr); + IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) = MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(), @@ -5329,7 +5314,7 @@ struct VarArgMIPS64Helper : public VarArgHelperBase { if (ArgSize < 8) VAArgOffset += (8 - ArgSize); } - Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, 
ArgSize); + Base = getShadowPtrForVAArgument(IRB, VAArgOffset, ArgSize); VAArgOffset += ArgSize; VAArgOffset = alignTo(VAArgOffset, 8); if (!Base) @@ -5371,12 +5356,9 @@ struct VarArgMIPS64Helper : public VarArgHelperBase { for (CallInst *OrigInst : VAStartInstrumentationList) { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* - Value *RegSaveAreaPtrPtr = - IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( + IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5460,11 +5442,11 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *Base; switch (AK) { case AK_GeneralPurpose: - Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset); + Base = getShadowPtrForVAArgument(IRB, GrOffset); GrOffset += 8 * RegNum; break; case AK_FloatingPoint: - Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset); + Base = getShadowPtrForVAArgument(IRB, VrOffset); VrOffset += 16 * RegNum; break; case AK_Memory: @@ -5475,7 +5457,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); uint64_t AlignedSize = alignTo(ArgSize, 8); unsigned BaseOffset = OverflowOffset; - Base = getShadowPtrForVAArgument(A->getType(), IRB, BaseOffset); + Base = getShadowPtrForVAArgument(IRB, BaseOffset); OverflowOffset += AlignedSize; if (OverflowOffset > kParamTLSSize) { // We have no space to copy shadow there. 
@@ -5500,7 +5482,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *SaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, offset)), - PointerType::get(*MS.C, 0)); + MS.PtrTy); return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr); } @@ -5509,7 +5491,7 @@ struct VarArgAArch64Helper : public VarArgHelperBase { Value *SaveAreaPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, offset)), - PointerType::get(*MS.C, 0)); + MS.PtrTy); Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr); return IRB.CreateSExt(SaveArea32, MS.IntptrTy); } @@ -5670,8 +5652,8 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { ArgAlign = Align(8); VAArgOffset = alignTo(VAArgOffset, ArgAlign); if (!IsFixed) { - Value *Base = getShadowPtrForVAArgument( - RealTy, IRB, VAArgOffset - VAArgBase, ArgSize); + Value *Base = + getShadowPtrForVAArgument(IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) { Value *AShadowPtr, *AOriginPtr; std::tie(AShadowPtr, AOriginPtr) = @@ -5707,8 +5689,8 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { VAArgOffset += (8 - ArgSize); } if (!IsFixed) { - Base = getShadowPtrForVAArgument(A->getType(), IRB, - VAArgOffset - VAArgBase, ArgSize); + Base = + getShadowPtrForVAArgument(IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } @@ -5755,12 +5737,9 @@ struct VarArgPowerPC64Helper : public VarArgHelperBase { for (CallInst *OrigInst : VAStartInstrumentationList) { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* - Value *RegSaveAreaPtrPtr = - IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = - IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( + IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5855,7 +5834,7 @@ struct VarArgSystemZHelper : public VarArgHelperBase { Type *T = A->getType(); ArgKind AK = classifyArgument(T); if (AK == ArgKind::Indirect) { - T = PointerType::get(T, 0); + T = MS.PtrTy; AK = ArgKind::GeneralPurpose; } if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset) @@ -5948,8 +5927,7 @@ struct VarArgSystemZHelper : public VarArgHelperBase { if (SE != ShadowExtension::None) Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(), /*Signed*/ SE == ShadowExtension::Sign); - ShadowBase = IRB.CreateIntToPtr( - ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s"); + ShadowBase = IRB.CreateIntToPtr(ShadowBase, MS.PtrTy, "_msarg_va_s"); IRB.CreateStore(Shadow, ShadowBase); if (MS.TrackOrigins) { Value *Origin = MSV.getOrigin(A); @@ -5964,13 +5942,12 @@ struct VarArgSystemZHelper : public VarArgHelperBase { } void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) { - Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd( IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)), - PointerType::get(RegSaveAreaPtrTy, 0)); - Value *RegSaveAreaPtr = 
IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); + MS.PtrTy); + Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = @@ -5991,14 +5968,12 @@ struct VarArgSystemZHelper : public VarArgHelperBase { // FIXME: This implementation limits OverflowOffset to kParamTLSSize, so we // don't know real overflow size and can't clear shadow beyond kParamTLSSize. void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) { - Type *OverflowArgAreaPtrTy = PointerType::getUnqual(*MS.C); // i64* Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( IRB.CreateAdd( IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)), - PointerType::get(OverflowArgAreaPtrTy, 0)); - Value *OverflowArgAreaPtr = - IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr); + MS.PtrTy); + Value *OverflowArgAreaPtr = IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; const Align Alignment = Align(8); std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) = From 071606ab282bb622a87759569b7044ec19a9c641 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 18:03:11 +0800 Subject: [PATCH 202/425] [RISCV] Remove RV32 FIXMEs completed in #107290. NFC --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll | 2 -- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll | 2 -- 5 files changed, 10 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index 805a3c640957b..6246ef7db0cb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -1392,8 +1392,6 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vadd.vi on RV32. - define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vadd_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index c5dd6ac344a37..5030fda9dea33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -1405,8 +1405,6 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vsadd.vi on RV32. - define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsadd_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index 17d9c437590a7..562399ea33e7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -1401,8 +1401,6 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vsaddu.vi on RV32. 
- define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsaddu_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index 90e1b5ce55752..549c6ca11e320 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -1447,8 +1447,6 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vssub.vi on RV32. - define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssub_vx_v32i64_evl12: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index 59899ab8b9994..683f1150310b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -1442,8 +1442,6 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -; FIXME: We don't match vssubu.vi on RV32. - define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssubu_vx_v32i64_evl12: ; CHECK: # %bb.0: From 5ee73953f03fe0cf53190c8dc9a257c752ab4171 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 11:10:42 +0100 Subject: [PATCH 203/425] [AMDGPU] Add image_atomic_fmin/fmax as aliases for GFX12 (#107242) This just follows SP3. --- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 4 ++++ llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index b4e58cfd98a23..5c49a8116ae7f 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1675,6 +1675,10 @@ defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic , "image_atomic_add_flt", 0, 1>; defm IMAGE_ATOMIC_MIN_FLT : MIMG_Atomic , "image_atomic_min_num_flt", 0, 1, "image_atomic_min_flt">; defm IMAGE_ATOMIC_MAX_FLT : MIMG_Atomic , "image_atomic_max_num_flt", 0, 1, "image_atomic_max_flt">; +let AssemblerPredicate = isGFX12Plus in { + def : AMDGPUMnemonicAlias<"image_atomic_fmin", "image_atomic_min_flt">; + def : AMDGPUMnemonicAlias<"image_atomic_fmax", "image_atomic_max_flt">; +} defm IMAGE_SAMPLE : MIMG_Sampler_WQM , AMDGPUSample>; let OtherPredicates = [HasImageInsts, HasExtendedImageInsts] in { diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s index a88a3ef100fb4..fceab4b7830f9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s @@ -29,3 +29,9 @@ image_atomic_min_num_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D image_atomic_max_num_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D // GFX12: image_atomic_max_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_fmin v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: image_atomic_min_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_fmax v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: image_atomic_max_flt v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] From 
d0278cf395e09bfb8dbef9cb92e6103be91e1eb3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 5 Sep 2024 18:11:20 +0800 Subject: [PATCH 204/425] [RISCV] Remove some more completed FIXMEs from tests. NFC --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 1 - llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll | 2 -- 7 files changed, 8 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 776a1e9bab6b2..81fb86cd81cd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -47,7 +47,6 @@ define <32 x i32> @insertelt_v32i32_0(<32 x i32> %a, i32 %y) { ret <32 x i32> %b } -; FIXME: Should only require an m2 slideup define <32 x i32> @insertelt_v32i32_4(<32 x i32> %a, i32 %y) { ; CHECK-LABEL: insertelt_v32i32_4: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll index 2a4fbb248cd9c..feeef73e538ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1425,7 +1425,6 @@ define @vadd_vi_nxv32i32_evl_nx8( %va, %v } -; FIXME: The first vadd.vi should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 5fdfb332da7cf..f65e708f5303c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -1062,7 +1062,6 @@ define @vmax_vx_nxv32i32_evl_nx8( %va, i3 ret %v } -; FIXME: The first vmax.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 7d678950b7a3c..df1ad58e5ecbd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -1061,7 +1061,6 @@ define @vmaxu_vx_nxv32i32_evl_nx8( %va, i ret %v } -; FIXME: The first vmaxu.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 98a288ed68b9a..0bf0638633aa4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -1062,7 +1062,6 @@ define @vmin_vx_nxv32i32_evl_nx8( %va, i3 ret %v } -; FIXME: The first vmin.vx should be able to infer that its AVL is equivalent to VLMAX. 
; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 34b554b7ff514..2acebdf2e646d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -1061,7 +1061,6 @@ define @vminu_vx_nxv32i32_evl_nx8( %va, i ret %v } -; FIXME: The first vminu.vx should be able to infer that its AVL is equivalent to VLMAX. ; FIXME: The upper half of the operation is doing nothing but we don't catch ; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) ; (the "original" %evl is the "and", due to known-bits issues with legalizing diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index a4d58985b75de..b7ce0e3f196f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1394,8 +1394,6 @@ define @i1_zext( %va, %vb } ; %x.i32 and %y.i32 are disjoint, so DAGCombiner will combine it into an or. -; FIXME: We should be able to recover the or into vwaddu.vv if the disjoint -; flag is set. define @vwaddu_vv_disjoint_or_add( %x.i8, %y.i8) { ; CHECK-LABEL: vwaddu_vv_disjoint_or_add: ; CHECK: # %bb.0: From 3299bc863fd74613fdfad2a2fde3f75de79bd645 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 5 Sep 2024 11:18:29 +0100 Subject: [PATCH 205/425] [DWARF] Identify prologue_end by instruction rather than DILocation (#107264) Currently, we identify the end of the prologue as being "the instruction that first has *this* DebugLoc". It works well enough, but I feel identifying a position in a function is best communicated by a MachineInstr. Plus, I've got some patches coming that depend upon this. --- llvm/include/llvm/CodeGen/DebugHandlerBase.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 23 ++++++++++---------- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 6 +++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 85046c200ff9b..d39e7e68cb255 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -74,7 +74,7 @@ class DebugHandlerBase { /// This location indicates end of function prologue and beginning of /// function body. - DebugLoc PrologEndLoc; + const MachineInstr *PrologEndLoc; /// This block includes epilogue instructions. const MachineBasicBlock *EpilogBeginBlock = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 9da9382baf863..ea3fed8817d89 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2111,9 +2111,9 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // (The new location might be an explicit line 0, which we do emit.) if (DL.getLine() == 0 && LastAsmLine == 0) return; - if (DL == PrologEndLoc) { + if (MI == PrologEndLoc) { Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT; - PrologEndLoc = DebugLoc(); + PrologEndLoc = nullptr; } // If the line changed, we call that a new statement; unless we went to // line 0 and came back, in which case it is not a new statement. 
We also @@ -2131,10 +2131,11 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { PrevInstLoc = DL; } -static std::pair findPrologueEndLoc(const MachineFunction *MF) { +static std::pair +findPrologueEndLoc(const MachineFunction *MF) { // First known non-DBG_VALUE and non-frame setup location marks // the beginning of the function body. - DebugLoc LineZeroLoc; + const MachineInstr *LineZeroLoc = nullptr; const Function &F = MF->getFunction(); // Some instructions may be inserted into prologue after this function. Must @@ -2151,9 +2152,9 @@ static std::pair findPrologueEndLoc(const MachineFunction *MF) { // meaningful breakpoint. If none is found, return the first // location after the frame setup. if (MI.getDebugLoc().getLine()) - return std::make_pair(MI.getDebugLoc(), IsEmptyPrologue); + return std::make_pair(&MI, IsEmptyPrologue); - LineZeroLoc = MI.getDebugLoc(); + LineZeroLoc = &MI; } IsEmptyPrologue = false; } @@ -2184,10 +2185,10 @@ static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col, Discriminator, Fn); } -DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, - unsigned CUID) { - std::pair PrologEnd = findPrologueEndLoc(&MF); - DebugLoc PrologEndLoc = PrologEnd.first; +const MachineInstr * +DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { + std::pair PrologEnd = findPrologueEndLoc(&MF); + const MachineInstr *PrologEndLoc = PrologEnd.first; bool IsEmptyPrologue = PrologEnd.second; // Get beginning of function. @@ -2206,7 +2207,7 @@ DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, CUID, getDwarfVersion(), getUnits()); return PrologEndLoc; } - return DebugLoc(); + return nullptr; } // Gather pre-function debug information. Assumes being called immediately diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 6e379396ea079..19f5b677bb8d0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -724,8 +724,10 @@ class DwarfDebug : public DebugHandlerBase { /// Emit all Dwarf sections that should come after the content. void endModule() override; - /// Emits inital debug location directive. - DebugLoc emitInitialLocDirective(const MachineFunction &MF, unsigned CUID); + /// Emits inital debug location directive. Returns instruction at which + /// the function prologue ends. + const MachineInstr *emitInitialLocDirective(const MachineFunction &MF, + unsigned CUID); /// Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; From 95684afbcd59f34be580f75ee32f766874b5d0f5 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 5 Sep 2024 12:53:57 +0200 Subject: [PATCH 206/425] [IR][ARM64EC][NFC] Clean up and document ARM64EC mangling helpers. (#107230) --- llvm/include/llvm/IR/Mangler.h | 5 +++++ llvm/lib/IR/Mangler.cpp | 39 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index f28ffc961b6db..349f9e6e75233 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -53,7 +53,12 @@ void emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV, void emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV, const Triple &T, Mangler &M); +/// Returns the ARM64EC mangled function name unless the input is already +/// mangled. 
std::optional getArm64ECMangledFunctionName(StringRef Name); + +/// Returns the ARM64EC demangled function name, unless the input is not +/// mangled. std::optional getArm64ECDemangledFunctionName(StringRef Name); } // End llvm namespace diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp index e6c3ea9d56883..15a4debf191a5 100644 --- a/llvm/lib/IR/Mangler.cpp +++ b/llvm/lib/IR/Mangler.cpp @@ -291,39 +291,40 @@ void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV, } std::optional llvm::getArm64ECMangledFunctionName(StringRef Name) { - bool IsCppFn = Name[0] == '?'; - if (IsCppFn && Name.contains("$$h")) - return std::nullopt; - if (!IsCppFn && Name[0] == '#') - return std::nullopt; + if (Name[0] != '?') { + // For non-C++ symbols, prefix the name with "#" unless it's already + // mangled. + if (Name[0] == '#') + return std::nullopt; + return std::optional(("#" + Name).str()); + } - StringRef Prefix = "$$h"; - size_t InsertIdx = 0; - if (IsCppFn) { - InsertIdx = Name.find("@@"); - size_t ThreeAtSignsIdx = Name.find("@@@"); - if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { - InsertIdx += 2; - } else { - InsertIdx = Name.find("@"); - if (InsertIdx != std::string::npos) - InsertIdx++; - } + // Insert the ARM64EC "$$h" tag after the mangled function name. + if (Name.contains("$$h")) + return std::nullopt; + size_t InsertIdx = Name.find("@@"); + size_t ThreeAtSignsIdx = Name.find("@@@"); + if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { + InsertIdx += 2; } else { - Prefix = "#"; + InsertIdx = Name.find("@"); + if (InsertIdx != std::string::npos) + InsertIdx++; } return std::optional( - (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str()); + (Name.substr(0, InsertIdx) + "$$h" + Name.substr(InsertIdx)).str()); } std::optional llvm::getArm64ECDemangledFunctionName(StringRef Name) { + // For non-C++ names, drop the "#" prefix. if (Name[0] == '#') return std::optional(Name.substr(1)); if (Name[0] != '?') return std::nullopt; + // Drop the ARM64EC "$$h" tag. std::pair Pair = Name.split("$$h"); if (Pair.second.empty()) return std::nullopt; From 87b4b648585f69a2ea148e86543aa31474e59acd Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 5 Sep 2024 13:17:17 +0200 Subject: [PATCH 207/425] Fix a typo in CheckExprLifetime.cpp, NFC --- clang/lib/Sema/CheckExprLifetime.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 1482711cc2839..6ed6b8aaa7040 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -328,7 +328,7 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { // We assuments that a normal assingment operator always returns *this, that is, // an lvalue reference that is the same type as the implicit object parameter // (or the LHS for a non-member operator$=). -static bool isNormalAsisgnmentOperator(const FunctionDecl *FD) { +static bool isNormalAssignmentOperator(const FunctionDecl *FD) { OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { QualType RetT = FD->getReturnType(); @@ -362,7 +362,7 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return true; } - return isNormalAsisgnmentOperator(FD); + return isNormalAssignmentOperator(FD); } // Visit lifetimebound or gsl-pointer arguments. 
@@ -940,10 +940,10 @@ static bool pathOnlyHandlesGslPointer(IndirectLocalPath &Path) { return false; } -static bool isAssginmentOperatorLifetimeBound(CXXMethodDecl *CMD) { +static bool isAssignmentOperatorLifetimeBound(CXXMethodDecl *CMD) { if (!CMD) return false; - return isNormalAsisgnmentOperator(CMD) && CMD->param_size() == 1 && + return isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && CMD->getParamDecl(0)->hasAttr(); } @@ -953,7 +953,7 @@ static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef, diag::warn_dangling_lifetime_pointer_assignment, SourceLocation()); return (EnableGSLAssignmentWarnings && (isRecordWithAttr(Entity.LHS->getType()) || - isAssginmentOperatorLifetimeBound(Entity.AssignmentOperator))); + isAssignmentOperatorLifetimeBound(Entity.AssignmentOperator))); } static void checkExprLifetimeImpl(Sema &SemaRef, From 3e070906eff720dc44aee86e533e12aafc8bb14b Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 5 Sep 2024 13:24:38 +0200 Subject: [PATCH 208/425] Fix llvm-else-after-return clang-tidy warning in CheckExprLifetime.cpp, NFC --- clang/lib/Sema/CheckExprLifetime.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 6ed6b8aaa7040..8f4d5d50669f1 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -288,7 +288,8 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { // Map and set types. .Cases("find", "equal_range", "lower_bound", "upper_bound", true) .Default(false); - } else if (Callee->getReturnType()->isReferenceType()) { + } + if (Callee->getReturnType()->isReferenceType()) { if (!Callee->getIdentifier()) { auto OO = Callee->getOverloadedOperator(); return OO == OverloadedOperatorKind::OO_Subscript || @@ -316,7 +317,8 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { .Cases("end", "rend", "cend", "crend", true) .Case("data", true) .Default(false); - } else if (FD->getReturnType()->isReferenceType()) { + } + if (FD->getReturnType()->isReferenceType()) { return llvm::StringSwitch(FD->getName()) .Cases("get", "any_cast", true) .Default(false); From 07bef02831836748f46ddd285520f351fe18cfe9 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 5 Sep 2024 12:30:20 +0100 Subject: [PATCH 209/425] [OpenMPOpt] Initialize OpenMPIRBuilderConfig::IsGPU flag (#104456) This patch ensures the `IsGPU` flag is set by the OpenMPOpt pass, so that it can be relied upon by `OpenMPIRBuilder` methods when called by that pass as well. Since currently there are very limited callers for the `OpenMPIRBuilder::isGPU()` method, no assertions are being triggered by the lack of initialization of this flag. However, when more offloading-related features are implemented, it will eventually start happening. 
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 7c1489f37049f..cd94661bbe07f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -286,6 +286,19 @@ struct OMPInformationCache : public InformationCache { OpenMPPostLink(OpenMPPostLink) { OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M); + const Triple T(OMPBuilder.M.getTargetTriple()); + switch (T.getArch()) { + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + case llvm::Triple::amdgcn: + assert(OMPBuilder.Config.IsTargetDevice && + "OpenMP AMDGPU/NVPTX is only prepared to deal with device code."); + OMPBuilder.Config.IsGPU = true; + break; + default: + OMPBuilder.Config.IsGPU = false; + break; + } OMPBuilder.initialize(); initializeRuntimeFunctions(M); initializeInternalControlVars(); From 142433684a6e3a2936f814268396dea4190905dc Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Thu, 5 Sep 2024 12:44:10 +0100 Subject: [PATCH 210/425] [OpenMP][Flang] Fix dynamic-extent array mapping (#107247) This patch fixes the mapping and lowering of arrays with dynamic extents and adds a new test for the same. The fix discards the incomplete the dynamic extent information and replacing it with just the base type. When lowering to llvm later, the bounds information is used instead. --- flang/lib/Lower/OpenMP/Utils.cpp | 7 ++++ flang/test/Lower/OpenMP/array-bounds.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 2 +- .../offloading/fortran/target-map-dynamic.f90 | 33 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 offload/test/offloading/fortran/target-map-dynamic.f90 diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index bbf08961bee66..8073b24a1d5b4 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -132,6 +132,13 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::TypeAttr varType = mlir::TypeAttr::get( llvm::cast(retTy).getElementType()); + // For types with unknown extents such as <2x?xi32> we discard the incomplete + // type info and only retain the base type. The correct dimensions are later + // recovered through the bounds info. 
+ if (auto seqType = llvm::dyn_cast(varType.getValue())) + if (seqType.hasDynamicExtents()) + varType = mlir::TypeAttr::get(seqType.getEleTy()); + mlir::omp::MapInfoOp op = builder.create( loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 9e59d71f560fc..09498ca6cdde9 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -74,7 +74,7 @@ end subroutine assumed_shape_array !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index !HOST: %[[EXT:.*]] = arith.addi %[[C4_1]], %c1{{.*}} : index !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%c1{{.*}} : index) upper_bound(%c4{{.*}} : index) extent(%[[EXT]] : index) stride(%[[DIMS0]]#2 : index) start_idx(%c1{{.*}} : index) {stride_in_bytes = true} -!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, i32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} !HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref) { subroutine assumed_size_array(arr_read_write) integer, intent(inout) :: arr_read_write(*) diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index e86a2f9b6098d..6fccea7e37072 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -460,7 +460,7 @@ subroutine omp_target_implicit_bounds(n) integer :: n integer :: a(n) !CHECK: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%c0{{.*}} : index) upper_bound(%[[UB]] : index) extent(%[[VAL_7]] : index) stride(%c1{{.*}} : index) start_idx(%c1{{.*}} : index) - !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, !fir.array) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} + !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, i32) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} !CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} !CHECK: omp.target map_entries(%[[VAL_15]] -> %[[VAL_17:.*]], %[[VAL_16]] -> %[[VAL_18:.*]] : !fir.ref>, !fir.ref) { !CHECK: ^bb0(%[[VAL_17]]: !fir.ref>, %[[VAL_18]]: !fir.ref): diff --git a/offload/test/offloading/fortran/target-map-dynamic.f90 b/offload/test/offloading/fortran/target-map-dynamic.f90 new file mode 100644 index 0000000000000..8bf6980884a18 --- /dev/null +++ b/offload/test/offloading/fortran/target-map-dynamic.f90 @@ -0,0 +1,33 @@ +! Offloading test checking lowering of arrays with dynamic extents. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic + +subroutine test_array_target_enter_data(dims) + integer, intent(in) :: dims(2) + double precision :: A(2, dims(2)) + !$omp target enter data map(to: U) + + A(2,2) = 1.0 + !$omp target + A(1,1) = 10 + A(2,1) = 20 + A(1,2) = 30 + A(2,2) = 40 + !$omp end target + + !$omp target exit data map(from: A) + + print *, A +end subroutine test_array_target_enter_data + +program main + integer :: dimensions(2) + dimensions(1) = 1 + dimensions(2) = 2 + +call test_array_target_enter_data(dimensions) +end program + + +! CHECK: 10. 20. 30. 40. From b44d9e5d3605d7ddd64992e3c77b6669f0f7701b Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 5 Sep 2024 12:50:21 +0100 Subject: [PATCH 211/425] VPlanTransforms: fix style after cursory reading (NFC) (#105827) --- .../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9796ee64f6ef9..39f7bf55ee5eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1072,10 +1072,11 @@ void VPlanTransforms::truncateToMinimalBitwidths( ResultVPV->replaceAllUsesWith(Ext); Ext->setOperand(0, ResultVPV); assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); - } else + } else { assert( match(&R, m_Binary(m_VPValue(), m_VPValue())) && "Only ICmps should not need extending the result."); + } assert(!isa(&R) && "stores cannot be narrowed"); if (isa(&R)) @@ -1214,7 +1215,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. - auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); LaneMaskPhi->insertAfter(CanonicalIVPHI); // Create the active lane mask for the next iteration of the loop before the @@ -1290,7 +1291,7 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto FoundWidenCanonicalIVUser = + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), [](VPUser *U) { return isa(U); }); assert(FoundWidenCanonicalIVUser && @@ -1440,14 +1441,13 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // Collect recipes in the backward slice of `Root` that may generate a poison // value that is used after vectorization. SmallPtrSet Visited; - auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { SmallVector Worklist; Worklist.push_back(Root); // Traverse the backward slice of Root through its use-def chain. while (!Worklist.empty()) { - VPRecipeBase *CurRec = Worklist.back(); - Worklist.pop_back(); + VPRecipeBase *CurRec = Worklist.pop_back_val(); if (!Visited.insert(CurRec).second) continue; @@ -1493,8 +1493,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } // Add new definitions to the worklist. 
- for (VPValue *operand : CurRec->operands()) - if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) + for (VPValue *Operand : CurRec->operands()) + if (VPRecipeBase *OpDef = Operand->getDefiningRecipe()) Worklist.push_back(OpDef); } }); @@ -1510,7 +1510,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && BlockNeedsPredication(UnderlyingInstr.getParent())) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); + CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } else if (auto *InterleaveRec = dyn_cast(&Recipe)) { VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); if (AddrDef) { @@ -1526,7 +1526,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } if (NeedPredication) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); + CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } } } From fa385274baae77a0ea7e78c4c6feca6b0ab4f1dc Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 5 Sep 2024 13:53:19 +0200 Subject: [PATCH 212/425] [libc++] Add ABI tests for unordered_{map,set} (#107200) These are used to ensure #76756 is correct. --- .../unord.map/abi.compile.pass.cpp | 134 ++++++++++++++++++ .../unord.set/abi.compile.pass.cpp | 132 +++++++++++++++++ libcxx/utils/libcxx/test/features.py | 1 + 3 files changed, 267 insertions(+) create mode 100644 libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp create mode 100644 libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp new file mode 100644 index 0000000000000..c8e5ba0ec899e --- /dev/null +++ b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-abi-fix-unordered-container-size-type + +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_macros.h" + +template +class small_pointer { + std::uint16_t offset; +}; + +template +class small_iter_allocator { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + small_iter_allocator() TEST_NOEXCEPT {} + + template + small_iter_allocator(small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; } + friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; } +}; + +template +class final_small_iter_allocator final { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + final_small_iter_allocator() TEST_NOEXCEPT {} + + template + final_small_iter_allocator(final_small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; } + friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; } +}; + +template +using unordered_map_alloc = std::unordered_map, std::equal_to, Alloc>; + +#if __SIZE_WIDTH__ == 64 + +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 64, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 40, ""); +static_assert(sizeof(unordered_map_alloc > >) == 64, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, + ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_map) == 96, ""); +static_assert(TEST_ALIGNOF(std::unordered_map) == 32, ""); + +#elif __SIZE_WIDTH__ == 32 + +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 44, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); +static_assert(sizeof(unordered_map_alloc > >) == 20, ""); 
+static_assert(sizeof(unordered_map_alloc > >) == 44, ""); +static_assert(sizeof(unordered_map_alloc > >) == 12, ""); +static_assert(sizeof(unordered_map_alloc > >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_map_alloc > >) == 4, + ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_map) == 96); +static_assert(TEST_ALIGNOF(std::unordered_map) == 32); + +#else +# error std::size_t has an unexpected size +#endif diff --git a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp new file mode 100644 index 0000000000000..359e248ff7a4f --- /dev/null +++ b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-abi-fix-unordered-container-size-type + +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_macros.h" + +template +class small_pointer { + std::uint16_t offset; +}; + +template +class small_iter_allocator { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + small_iter_allocator() TEST_NOEXCEPT {} + + template + small_iter_allocator(small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; } + friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; } +}; + +template +class final_small_iter_allocator final { +public: + using value_type = T; + using pointer = small_pointer; + using size_type = std::uint16_t; + using difference_type = std::int16_t; + + final_small_iter_allocator() TEST_NOEXCEPT {} + + template + final_small_iter_allocator(final_small_iter_allocator) TEST_NOEXCEPT {} + + T* allocate(std::size_t n); + void deallocate(T* p, std::size_t); + + friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; } + friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; } +}; + +template +using unordered_set_alloc = std::unordered_set, std::equal_to, Alloc>; + +#if __SIZE_WIDTH__ == 64 + +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 64, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 
16, ""); + +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 40, ""); +static_assert(sizeof(unordered_set_alloc >) == 64, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 8, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_set) == 96, ""); +static_assert(TEST_ALIGNOF(std::unordered_set) == 32, ""); + +#elif __SIZE_WIDTH__ == 32 + +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 44, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 20, ""); +static_assert(sizeof(unordered_set_alloc >) == 44, ""); +static_assert(sizeof(unordered_set_alloc >) == 12, ""); +static_assert(sizeof(unordered_set_alloc >) == 16, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); +static_assert(TEST_ALIGNOF(unordered_set_alloc >) == 4, ""); + +struct TEST_ALIGNAS(32) AlignedHash {}; +struct UnalignedEqualTo {}; + +static_assert(sizeof(std::unordered_set) == 96); +static_assert(TEST_ALIGNOF(std::unordered_set) == 32); + +#else +# error std::size_t has an unexpected size +#endif diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 6857a28eb3299..2cd04124a6ca5 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -374,6 +374,7 @@ def _mingwSupportsModules(cfg): "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", + "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem", "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device", From 11040560ba30381ed47c3089a2562a41b00dbb4b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 12:52:00 +0100 Subject: [PATCH 
213/425] [X86] preferABDSToABSWithNSW - use ABDS for i32/i64 if we have CMOV Now that we have better ABDS lowering, prefer cmov(sub(x,y),sub(y,x)) to cmov(abs(sub(x,y)),sub(x,y)) to improve ILP --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/test/CodeGen/X86/abds-neg.ll | 36 ++++++++++++--------- llvm/test/CodeGen/X86/abds.ll | 42 +++++++++++++------------ 3 files changed, 45 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 451881e1d6141..092a7192929fd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58210,7 +58210,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, } bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const { - return false; + return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64); } // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS. diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 833273dc98243..f837f49abf7a4 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -1027,19 +1027,22 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmovnsl %edi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -1050,19 +1053,22 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32_undef: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmovnsl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmovnsl %edi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -1090,10 +1096,11 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: cmovnsq %rdi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1121,10 +1128,11 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64_undef: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; 
X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: negq %rax -; X64-NEXT: cmovnsq %rdi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index d9ba140032b31..9476fd14306fe 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -928,19 +928,20 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind { define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: negl %eax -; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -950,19 +951,20 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_subnsw_i32_undef: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: negl %eax -; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: ; X64: # %bb.0: -; X64-NEXT: subl %esi, %edi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: subl %esi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -986,10 +988,10 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: cmovsq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1013,10 +1015,10 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; ; X64-LABEL: abd_subnsw_i64_undef: ; X64: # %bb.0: -; X64-NEXT: subq %rsi, %rdi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: cmovsq %rdi, %rax +; X64-NEXT: subq %rsi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) From 84cf3a573e89b18ce79ff32a7646c0a99729029c Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 13:23:08 +0100 Subject: [PATCH 214/425] [Clang] CWG2749: relational operators involving pointers to void (#93046) https://cplusplus.github.io/CWG/issues/2749.html This DR's effects are backported to C++98. Does not affect C where integral constant expressions cannot involve pointers. 
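As a self-contained illustration of what this permits (mirroring the static_asserts added to the tests below; the array name is arbitrary):

  // With CWG2749 applied, comparing unequal pointers to void is a valid
  // constant expression (C++11 or later for this static_assert form).
  extern int x[2];
  static_assert(static_cast<const void *>(&x[0]) <
                    static_cast<const void *>(&x[1]),
                "array element 0 is ordered before element 1");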
--------- Co-authored-by: Vlad Serebrennikov Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 4 ++ .../include/clang/Basic/DiagnosticASTKinds.td | 2 - clang/lib/AST/ExprConstant.cpp | 10 ---- clang/test/AST/ByteCode/literals.cpp | 8 +--- clang/test/CXX/drs/cwg27xx.cpp | 46 ++++++++++++++++++- clang/test/CXX/expr/expr.const/p2-0x.cpp | 11 +++-- clang/www/cxx_dr_status.html | 2 +- 7 files changed, 58 insertions(+), 25 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bd84a2e40fb8b..dc103aceebc36 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -154,6 +154,10 @@ Resolutions to C++ Defect Reports - Allow ``void{}`` as a prvalue of type ``void``. (`CWG2351: void{} `_). +- Clang now allows comparing unequal object pointers that have been cast to ``void *`` + in constant expressions. These comparisons always worked in non-constant expressions. + (`CWG2749: Treatment of "pointer to void" for relational comparisons `_). + C Language Changes ------------------ diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 45ad84831589b..91135b18c7571 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -148,8 +148,6 @@ def note_constexpr_var_init_weak : Note< def note_constexpr_typeid_polymorphic : Note< "typeid applied to expression of polymorphic type %0 is " "not allowed in a constant expression in C++ standards before C++20">; -def note_constexpr_void_comparison : Note< - "comparison between unequal pointers to void has unspecified result">; def note_constexpr_temporary_here : Note<"temporary created here">; def note_constexpr_dynamic_alloc_here : Note<"heap allocation performed here">; def note_constexpr_conditional_never_const : Note< diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3dc13c14c0034..205cbdf52a6f7 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13895,16 +13895,6 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E, SubobjectDesignator &LHSDesignator = LHSValue.getLValueDesignator(); SubobjectDesignator &RHSDesignator = RHSValue.getLValueDesignator(); - // C++11 [expr.rel]p3: - // Pointers to void (after pointer conversions) can be compared, with a - // result defined as follows: If both pointers represent the same - // address or are both the null pointer value, the result is true if the - // operator is <= or >= and false otherwise; otherwise the result is - // unspecified. - // We interpret this as applying to pointers to *cv* void. - if (LHSTy->isVoidPointerType() && LHSOffset != RHSOffset && IsRelational) - Info.CCEDiag(E, diag::note_constexpr_void_comparison); - // C++11 [expr.rel]p2: // - If two pointers point to non-static data members of the same object, // or to subobjects or array elements fo such members, recursively, the diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 2329d4d973f01..13d6c4feb3500 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -199,12 +199,8 @@ namespace PointerComparison { constexpr bool v3 = null == pv; // ok constexpr bool v4 = qv == pv; // ok - /// FIXME: These two are rejected by the current interpreter, but - /// accepted by GCC. 
- constexpr bool v5 = qv >= pv; // ref-error {{constant expression}} \ - // ref-note {{unequal pointers to void}} - constexpr bool v8 = qv > (void*)&s.a; // ref-error {{constant expression}} \ - // ref-note {{unequal pointers to void}} + constexpr bool v5 = qv >= pv; + constexpr bool v8 = qv > (void*)&s.a; constexpr bool v6 = qv > null; // both-error {{must be initialized by a constant expression}} \ // both-note {{comparison between '&s.b' and 'nullptr' has unspecified value}} diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index 406c8ea41f3b2..b3867696c615b 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -pedantic-errors -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -pedantic-errors -verify=expected,cxx98 %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -pedantic-errors -verify=expected %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++14 -pedantic-errors -verify=expected %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -pedantic-errors -verify=expected %s @@ -6,6 +6,29 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -pedantic-errors -verify=expected,since-cxx23 %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2c -pedantic-errors -verify=expected,since-cxx23,since-cxx26 %s +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + +namespace std { +#if __cplusplus >= 202002L + struct strong_ordering { + int n; + constexpr operator int() const { return n; } + static const strong_ordering less, equal, greater; + }; + constexpr strong_ordering strong_ordering::less{-1}, + strong_ordering::equal{0}, strong_ordering::greater{1}; +#endif +} // namespace std + namespace cwg2718 { // cwg2718: 2.7 struct B {}; struct D; @@ -18,6 +41,27 @@ void f(B b) { struct D : B {}; } // namespace cwg2718 +namespace cwg2749 { // cwg2749: 20 + +extern int x[2]; +struct Y { + int i; + int j; +}; +extern Y y[2]; + +static_assert(__enable_constant_folding(static_cast(&x[0]) < static_cast(&x[1])), ""); +static_assert(__enable_constant_folding(static_cast(&y[0].i) < static_cast(&y[0].j)), ""); +static_assert(__enable_constant_folding(static_cast(&y[0].j) < static_cast(&y[1].i)), ""); + +#if __cplusplus >= 202002L +static_assert((static_cast(&x[0]) <=> static_cast(&x[1])) == std::strong_ordering::less); +static_assert((static_cast(&y[0].i) <=> static_cast(&y[0].j)) == std::strong_ordering::less); +static_assert((static_cast(&y[0].j) <=> static_cast(&y[1].i)) == std::strong_ordering::less); +#endif + +} // namespace cwg2749 + namespace cwg2759 { // cwg2759: 19 #if __cplusplus >= 201103L diff --git a/clang/test/CXX/expr/expr.const/p2-0x.cpp b/clang/test/CXX/expr/expr.const/p2-0x.cpp index e3cd057baba75..767eee1c74f05 100644 --- a/clang/test/CXX/expr/expr.const/p2-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p2-0x.cpp @@ -571,18 +571,19 @@ namespace UnspecifiedRelations { // [expr.rel]p3: Pointers to void can be compared [...] if both pointers // represent the same address or are both the null pointer [...]; otherwise // the result is unspecified. 
+ // Same address restriction removed by CWG2749 struct S { int a, b; } s; constexpr void *null = 0; constexpr void *pv = (void*)&s.a; constexpr void *qv = (void*)&s.b; constexpr bool v1 = null < (int*)0; constexpr bool v2 = null < pv; // expected-error {{constant expression}} expected-note {{comparison between 'nullptr' and '&s.a' has unspecified value}} - constexpr bool v3 = null == pv; // ok - constexpr bool v4 = qv == pv; // ok - constexpr bool v5 = qv >= pv; // expected-error {{constant expression}} expected-note {{unequal pointers to void}} + constexpr bool v3 = null == pv; + constexpr bool v4 = qv == pv; + constexpr bool v5 = qv >= pv; constexpr bool v6 = qv > null; // expected-error {{constant expression}} expected-note {{comparison between '&s.b' and 'nullptr' has unspecified value}} - constexpr bool v7 = qv <= (void*)&s.b; // ok - constexpr bool v8 = qv > (void*)&s.a; // expected-error {{constant expression}} expected-note {{unequal pointers to void}} + constexpr bool v7 = qv <= (void*)&s.b; + constexpr bool v8 = qv > (void*)&s.a; } // - an assignment or a compound assignment (5.17); or diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index aa79c3706f32b..b638f0ff30bcc 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16341,7 +16341,7 @@

C++ defect report implementation status

2749 DRWP Treatment of "pointer to void" for relational comparisons - Unknown + Clang 20 2750 From d6832a611a7c4ec36f08b1cfe9af850dad32da2e Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 08:28:33 -0400 Subject: [PATCH 215/425] [libc++][modules] Modularize (#107254) Many headers include `` just for size_t, and pulling in additional content (e.g. the traits used for std::byte) is unnecessary. To solve this problem, this patch splits up `` into subcomponents so that headers can include only the parts that they actually require. This has the added benefit of making the modules build a lot stricter with respect to IWYU, and also providing a canonical location where we define `std::size_t` and friends (which were previously defined in multiple headers like `` and ``). After this patch, there's still many places in the codebase where we include `` when `<__cstddef/size_t.h>` would be sufficient. This patch focuses on removing `` includes from __type_traits to make these headers non-circular with ``. Additional refactorings can be tackled separately. --- libcxx/include/CMakeLists.txt | 5 ++ libcxx/include/__algorithm/ranges_minmax.h | 1 + libcxx/include/__atomic/atomic.h | 2 + libcxx/include/__charconv/to_chars_integral.h | 1 + libcxx/include/__cstddef/byte.h | 84 +++++++++++++++++++ libcxx/include/__cstddef/max_align_t.h | 27 ++++++ libcxx/include/__cstddef/nullptr_t.h | 25 ++++++ libcxx/include/__cstddef/ptrdiff_t.h | 25 ++++++ libcxx/include/__cstddef/size_t.h | 25 ++++++ libcxx/include/__exception/nested_exception.h | 2 + libcxx/include/__fwd/array.h | 3 +- libcxx/include/__fwd/complex.h | 2 +- libcxx/include/__fwd/pair.h | 2 +- libcxx/include/__fwd/span.h | 2 +- libcxx/include/__fwd/subrange.h | 2 +- libcxx/include/__fwd/tuple.h | 2 +- libcxx/include/__iterator/concepts.h | 1 + libcxx/include/__iterator/iterator_traits.h | 2 + libcxx/include/__iterator/wrap_iter.h | 1 + libcxx/include/__mdspan/layout_stride.h | 1 + libcxx/include/__memory/pointer_traits.h | 1 + libcxx/include/__memory/shared_ptr.h | 2 + libcxx/include/__memory/unique_ptr.h | 1 + libcxx/include/__memory/uses_allocator.h | 1 + .../__random/mersenne_twister_engine.h | 1 + libcxx/include/__random/seed_seq.h | 2 + .../__random/subtract_with_carry_engine.h | 1 + libcxx/include/__ranges/subrange.h | 1 + .../include/__string/constexpr_c_functions.h | 2 + libcxx/include/__tuple/tuple_size.h | 2 + .../include/__type_traits/aligned_storage.h | 2 +- libcxx/include/__type_traits/aligned_union.h | 2 +- libcxx/include/__type_traits/alignment_of.h | 2 +- libcxx/include/__type_traits/extent.h | 2 +- libcxx/include/__type_traits/is_allocator.h | 2 +- libcxx/include/__type_traits/is_array.h | 2 +- .../include/__type_traits/is_bounded_array.h | 2 +- .../__type_traits/is_nothrow_destructible.h | 2 +- .../include/__type_traits/is_null_pointer.h | 2 +- libcxx/include/__type_traits/is_swappable.h | 2 +- libcxx/include/__type_traits/rank.h | 2 +- .../__type_traits/remove_all_extents.h | 2 +- libcxx/include/__type_traits/remove_extent.h | 2 +- libcxx/include/__type_traits/type_list.h | 2 +- libcxx/include/__utility/in_place.h | 1 + libcxx/include/__utility/pair.h | 1 + libcxx/include/__utility/swap.h | 1 + libcxx/include/any | 1 + libcxx/include/cstddef | 83 ++---------------- libcxx/include/cstdio | 2 +- libcxx/include/cstdlib | 2 +- libcxx/include/cstring | 2 +- libcxx/include/ctime | 2 +- libcxx/include/cuchar | 2 +- libcxx/include/cwchar | 2 +- .../include/experimental/__simd/reference.h | 1 + 
libcxx/include/experimental/__simd/scalar.h | 1 + libcxx/include/experimental/__simd/simd.h | 1 + .../include/experimental/__simd/simd_mask.h | 1 + libcxx/include/experimental/__simd/vec_ext.h | 1 + libcxx/include/module.modulemap | 31 +++++-- libcxx/include/typeinfo | 1 + libcxx/include/unordered_map | 2 + libcxx/include/unordered_set | 2 + .../test/libcxx/transitive_includes/cxx03.csv | 2 - .../test/libcxx/transitive_includes/cxx11.csv | 2 - .../test/libcxx/transitive_includes/cxx14.csv | 2 - .../test/libcxx/transitive_includes/cxx17.csv | 2 - .../test/libcxx/transitive_includes/cxx20.csv | 2 - .../test/libcxx/transitive_includes/cxx23.csv | 2 - .../test/libcxx/transitive_includes/cxx26.csv | 2 - .../bit/bit.pow.two/bit_ceil.pass.cpp | 1 + .../bit/bit.pow.two/bit_floor.pass.cpp | 1 + .../bit/bit.pow.two/bit_width.pass.cpp | 1 + .../bit/bit.pow.two/has_single_bit.pass.cpp | 1 + .../bit/bitops.count/countl_one.pass.cpp | 1 + .../bit/bitops.count/countl_zero.pass.cpp | 1 + .../bit/bitops.count/countr_one.pass.cpp | 1 + .../bit/bitops.count/countr_zero.pass.cpp | 1 + .../bit/bitops.count/popcount.pass.cpp | 1 + .../std/numerics/bit/bitops.rot/rotl.pass.cpp | 1 + .../std/numerics/bit/bitops.rot/rotr.pass.cpp | 1 + .../header_exportable_declarations.cpp | 4 +- libcxx/utils/libcxx/test/modules.py | 4 + 84 files changed, 306 insertions(+), 124 deletions(-) create mode 100644 libcxx/include/__cstddef/byte.h create mode 100644 libcxx/include/__cstddef/max_align_t.h create mode 100644 libcxx/include/__cstddef/nullptr_t.h create mode 100644 libcxx/include/__cstddef/ptrdiff_t.h create mode 100644 libcxx/include/__cstddef/size_t.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 210beaf5a3364..0f43916dae438 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -324,6 +324,11 @@ set(files __coroutine/coroutine_traits.h __coroutine/noop_coroutine_handle.h __coroutine/trivial_awaitables.h + __cstddef/byte.h + __cstddef/max_align_t.h + __cstddef/nullptr_t.h + __cstddef/ptrdiff_t.h + __cstddef/size_t.h __debug_utils/randomize_range.h __debug_utils/sanitizers.h __debug_utils/strict_weak_ordering_check.h diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h index 1b43b1e19cdec..5f2e5cb2a1eea 100644 --- a/libcxx/include/__algorithm/ranges_minmax.h +++ b/libcxx/include/__algorithm/ranges_minmax.h @@ -24,6 +24,7 @@ #include <__ranges/access.h> #include <__ranges/concepts.h> #include <__type_traits/desugars_to.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_reference.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index bcea21f5ce2e1..af6d12b5e4ce9 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -16,8 +16,10 @@ #include <__config> #include <__functional/operations.h> #include <__memory/addressof.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_floating_point.h> #include <__type_traits/is_function.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h index 0369f4dfb9bda..ccb5856df1799 100644 --- a/libcxx/include/__charconv/to_chars_integral.h +++ b/libcxx/include/__charconv/to_chars_integral.h @@ 
-21,6 +21,7 @@ #include <__system_error/errc.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/make_32_64_or_128_bit.h> #include <__type_traits/make_unsigned.h> diff --git a/libcxx/include/__cstddef/byte.h b/libcxx/include/__cstddef/byte.h new file mode 100644 index 0000000000000..b8cfe5e8d1c7e --- /dev/null +++ b/libcxx/include/__cstddef/byte.h @@ -0,0 +1,84 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_BYTE_H +#define _LIBCPP___CSTDDEF_BYTE_H + +#include <__config> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_integral.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if _LIBCPP_STD_VER >= 17 +namespace std { // purposefully not versioned + +enum class byte : unsigned char {}; + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) | static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator|=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs | __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator&(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) & static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator&=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs & __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator^(byte __lhs, byte __rhs) noexcept { + return static_cast( + static_cast(static_cast(__lhs) ^ static_cast(__rhs))); +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator^=(byte& __lhs, byte __rhs) noexcept { + return __lhs = __lhs ^ __rhs; +} + +_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator~(byte __b) noexcept { + return static_cast(static_cast(~static_cast(__b))); +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte& operator<<=(byte& __lhs, _Integer __shift) noexcept { + return __lhs = __lhs << __shift; +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte operator<<(byte __lhs, _Integer __shift) noexcept { + return static_cast(static_cast(static_cast(__lhs) << __shift)); +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte& operator>>=(byte& __lhs, _Integer __shift) noexcept { + return __lhs = __lhs >> __shift; +} + +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr byte operator>>(byte __lhs, _Integer __shift) noexcept { + return static_cast(static_cast(static_cast(__lhs) >> __shift)); +} + +template ::value, int> = 0> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Integer to_integer(byte __b) noexcept { + return static_cast<_Integer>(__b); +} + +} // namespace std +#endif // _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___CSTDDEF_BYTE_H diff --git a/libcxx/include/__cstddef/max_align_t.h b/libcxx/include/__cstddef/max_align_t.h new file mode 100644 index 0000000000000..7c09c7e7f3017 --- /dev/null +++ b/libcxx/include/__cstddef/max_align_t.h @@ -0,0 +1,27 @@ 
+//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_MAX_ALIGN_T_H +#define _LIBCPP___CSTDDEF_MAX_ALIGN_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_CXX03_LANG) +using ::max_align_t _LIBCPP_USING_IF_EXISTS; +#endif + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_MAX_ALIGN_T_H diff --git a/libcxx/include/__cstddef/nullptr_t.h b/libcxx/include/__cstddef/nullptr_t.h new file mode 100644 index 0000000000000..de3f7d4ab5fa7 --- /dev/null +++ b/libcxx/include/__cstddef/nullptr_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_NULLPTR_T_H +#define _LIBCPP___CSTDDEF_NULLPTR_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::nullptr_t; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_NULLPTR_T_H diff --git a/libcxx/include/__cstddef/ptrdiff_t.h b/libcxx/include/__cstddef/ptrdiff_t.h new file mode 100644 index 0000000000000..f8b5cdaaff01c --- /dev/null +++ b/libcxx/include/__cstddef/ptrdiff_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_PTRDIFF_T_H +#define _LIBCPP___CSTDDEF_PTRDIFF_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::ptrdiff_t _LIBCPP_USING_IF_EXISTS; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_PTRDIFF_T_H diff --git a/libcxx/include/__cstddef/size_t.h b/libcxx/include/__cstddef/size_t.h new file mode 100644 index 0000000000000..91abbf0131895 --- /dev/null +++ b/libcxx/include/__cstddef/size_t.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___CSTDDEF_SIZE_T_H +#define _LIBCPP___CSTDDEF_SIZE_T_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +using ::size_t _LIBCPP_USING_IF_EXISTS; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___CSTDDEF_SIZE_T_H diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index feb489f87f62f..4c7970d167ffa 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -13,6 +13,8 @@ #include <__exception/exception_ptr.h> #include <__memory/addressof.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_base_of.h> #include <__type_traits/is_class.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__fwd/array.h b/libcxx/include/__fwd/array.h index 6c6461e727604..794779ae46ab7 100644 --- a/libcxx/include/__fwd/array.h +++ b/libcxx/include/__fwd/array.h @@ -10,7 +10,8 @@ #define _LIBCPP___FWD_ARRAY_H #include <__config> -#include +#include <__cstddef/size_t.h> +#include <__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/complex.h b/libcxx/include/__fwd/complex.h index 22c78c5cc3c77..092d2e10b12b5 100644 --- a/libcxx/include/__fwd/complex.h +++ b/libcxx/include/__fwd/complex.h @@ -10,7 +10,7 @@ #define _LIBCPP___FWD_COMPLEX_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/pair.h b/libcxx/include/__fwd/pair.h index af32628fe1e0d..b8ba2b7e92324 100644 --- a/libcxx/include/__fwd/pair.h +++ b/libcxx/include/__fwd/pair.h @@ -10,8 +10,8 @@ #define _LIBCPP___FWD_PAIR_H #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/tuple.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/span.h b/libcxx/include/__fwd/span.h index 8dafa742c19df..5d473ee51c6b7 100644 --- a/libcxx/include/__fwd/span.h +++ b/libcxx/include/__fwd/span.h @@ -11,7 +11,7 @@ #define _LIBCPP___FWD_SPAN_H #include <__config> -#include +#include <__cstddef/size_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__fwd/subrange.h b/libcxx/include/__fwd/subrange.h index 60a41da23dd44..5b3a07e55348a 100644 --- a/libcxx/include/__fwd/subrange.h +++ b/libcxx/include/__fwd/subrange.h @@ -11,8 +11,8 @@ #include <__concepts/copyable.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/concepts.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__fwd/tuple.h b/libcxx/include/__fwd/tuple.h index 902770c29555e..2ed32bc0df4e1 100644 --- a/libcxx/include/__fwd/tuple.h +++ b/libcxx/include/__fwd/tuple.h @@ -10,7 +10,7 @@ #define _LIBCPP___FWD_TUPLE_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h index be2890bee4928..93ea7efd49537 100644 --- 
a/libcxx/include/__iterator/concepts.h +++ b/libcxx/include/__iterator/concepts.h @@ -34,6 +34,7 @@ #include <__memory/pointer_traits.h> #include <__type_traits/add_pointer.h> #include <__type_traits/common_reference.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_primary_template.h> #include <__type_traits/is_reference.h> diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h index 11af9e301842c..4d9ad480cc4a2 100644 --- a/libcxx/include/__iterator/iterator_traits.h +++ b/libcxx/include/__iterator/iterator_traits.h @@ -24,6 +24,8 @@ #include <__type_traits/common_reference.h> #include <__type_traits/conditional.h> #include <__type_traits/disjunction.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_object.h> #include <__type_traits/is_primary_template.h> diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index 34f8d5f1663b2..549d8ff2dbd7d 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -17,6 +17,7 @@ #include <__memory/addressof.h> #include <__memory/pointer_traits.h> #include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include diff --git a/libcxx/include/__mdspan/layout_stride.h b/libcxx/include/__mdspan/layout_stride.h index 704a5a4c1aea5..c57f596431c7c 100644 --- a/libcxx/include/__mdspan/layout_stride.h +++ b/libcxx/include/__mdspan/layout_stride.h @@ -25,6 +25,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_convertible.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_same.h> #include <__utility/as_const.h> diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h index 0914aceb318b7..8e08eb74413ee 100644 --- a/libcxx/include/__memory/pointer_traits.h +++ b/libcxx/include/__memory/pointer_traits.h @@ -15,6 +15,7 @@ #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_class.h> #include <__type_traits/is_function.h> #include <__type_traits/is_void.h> diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index d487e4fbe3a95..4dd8022822d22 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -34,6 +34,8 @@ #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> #include <__type_traits/disjunction.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_array.h> #include <__type_traits/is_bounded_array.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index 7f5e0ea243c95..392cf42137821 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -23,6 +23,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> #include <__type_traits/dependent_type.h> +#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_array.h> #include <__type_traits/is_assignable.h> diff --git 
a/libcxx/include/__memory/uses_allocator.h b/libcxx/include/__memory/uses_allocator.h index 84310c3fa5673..16504e8b2a998 100644 --- a/libcxx/include/__memory/uses_allocator.h +++ b/libcxx/include/__memory/uses_allocator.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_USES_ALLOCATOR_H #include <__config> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include diff --git a/libcxx/include/__random/mersenne_twister_engine.h b/libcxx/include/__random/mersenne_twister_engine.h index 65280d7c5505f..1f50e608ce8d4 100644 --- a/libcxx/include/__random/mersenne_twister_engine.h +++ b/libcxx/include/__random/mersenne_twister_engine.h @@ -13,6 +13,7 @@ #include <__algorithm/min.h> #include <__config> #include <__random/is_seed_sequence.h> +#include <__type_traits/enable_if.h> #include #include #include diff --git a/libcxx/include/__random/seed_seq.h b/libcxx/include/__random/seed_seq.h index 5cf84aeb8a72b..c1a320a75c88a 100644 --- a/libcxx/include/__random/seed_seq.h +++ b/libcxx/include/__random/seed_seq.h @@ -14,6 +14,8 @@ #include <__algorithm/max.h> #include <__config> #include <__iterator/iterator_traits.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_unsigned.h> #include #include diff --git a/libcxx/include/__random/subtract_with_carry_engine.h b/libcxx/include/__random/subtract_with_carry_engine.h index ec25fed49f949..926333cdda45e 100644 --- a/libcxx/include/__random/subtract_with_carry_engine.h +++ b/libcxx/include/__random/subtract_with_carry_engine.h @@ -14,6 +14,7 @@ #include <__config> #include <__random/is_seed_sequence.h> #include <__random/linear_congruential_engine.h> +#include <__type_traits/enable_if.h> #include #include #include diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h index aba584ef93354..144746babb325 100644 --- a/libcxx/include/__ranges/subrange.h +++ b/libcxx/include/__ranges/subrange.h @@ -33,6 +33,7 @@ #include <__tuple/tuple_size.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_reference.h> #include <__type_traits/make_unsigned.h> diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index 32fc06e121b36..9b8871e2e71a3 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -13,11 +13,13 @@ #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/datasizeof.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_always_bitcastable.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_equality_comparable.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_lexicographically_comparable.h> diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h index 18a17fd4d5878..21c9811abeee7 100644 --- a/libcxx/include/__tuple/tuple_size.h +++ b/libcxx/include/__tuple/tuple_size.h @@ -12,6 +12,8 @@ #include <__config> #include <__fwd/tuple.h> #include <__tuple/tuple_types.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_const.h> #include 
<__type_traits/is_volatile.h> #include diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 46aae12832f86..49b4e971bbb67 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -10,11 +10,11 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNED_STORAGE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/conditional.h> #include <__type_traits/integral_constant.h> #include <__type_traits/nat.h> #include <__type_traits/type_list.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/aligned_union.h b/libcxx/include/__type_traits/aligned_union.h index 005ed9a096ea8..de62a4b1c2a33 100644 --- a/libcxx/include/__type_traits/aligned_union.h +++ b/libcxx/include/__type_traits/aligned_union.h @@ -10,9 +10,9 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNED_UNION_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/aligned_storage.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/alignment_of.h b/libcxx/include/__type_traits/alignment_of.h index f2d069bf2488f..8871c8ce110d6 100644 --- a/libcxx/include/__type_traits/alignment_of.h +++ b/libcxx/include/__type_traits/alignment_of.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_ALIGNMENT_OF_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/extent.h b/libcxx/include/__type_traits/extent.h index bab03fe997eb6..1c34a4db1c4b5 100644 --- a/libcxx/include/__type_traits/extent.h +++ b/libcxx/include/__type_traits/extent.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_EXTENT_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_allocator.h b/libcxx/include/__type_traits/is_allocator.h index 144ffac4d7ce5..191eeb9a1f522 100644 --- a/libcxx/include/__type_traits/is_allocator.h +++ b/libcxx/include/__type_traits/is_allocator.h @@ -10,10 +10,10 @@ #define _LIBCPP___TYPE_IS_ALLOCATOR_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/void_t.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_array.h b/libcxx/include/__type_traits/is_array.h index dc23de28d2c63..f34204e19ed89 100644 --- a/libcxx/include/__type_traits/is_array.h +++ b/libcxx/include/__type_traits/is_array.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_ARRAY_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_bounded_array.h b/libcxx/include/__type_traits/is_bounded_array.h index 211403d638d08..a78b52e7062b8 100644 --- a/libcxx/include/__type_traits/is_bounded_array.h +++ b/libcxx/include/__type_traits/is_bounded_array.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H #include <__config> +#include <__cstddef/size_t.h> #include 
<__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_nothrow_destructible.h b/libcxx/include/__type_traits/is_nothrow_destructible.h index c2d5ca87232aa..41271a38f3711 100644 --- a/libcxx/include/__type_traits/is_nothrow_destructible.h +++ b/libcxx/include/__type_traits/is_nothrow_destructible.h @@ -10,10 +10,10 @@ #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DESTRUCTIBLE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_destructible.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_null_pointer.h b/libcxx/include/__type_traits/is_null_pointer.h index 9f5697e232684..abc5d142562f1 100644 --- a/libcxx/include/__type_traits/is_null_pointer.h +++ b/libcxx/include/__type_traits/is_null_pointer.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_IS_NULL_POINTER_H #include <__config> +#include <__cstddef/nullptr_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h index 0b817e6509933..f4d687094bbca 100644 --- a/libcxx/include/__type_traits/is_swappable.h +++ b/libcxx/include/__type_traits/is_swappable.h @@ -10,6 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_IS_SWAPPABLE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/add_lvalue_reference.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> @@ -18,7 +19,6 @@ #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/void_t.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/rank.h b/libcxx/include/__type_traits/rank.h index 7f6fad1c54024..aeeedec40dee5 100644 --- a/libcxx/include/__type_traits/rank.h +++ b/libcxx/include/__type_traits/rank.h @@ -10,8 +10,8 @@ #define _LIBCPP___TYPE_TRAITS_RANK_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/remove_all_extents.h b/libcxx/include/__type_traits/remove_all_extents.h index d5373b51f5221..db7dab4a6c132 100644 --- a/libcxx/include/__type_traits/remove_all_extents.h +++ b/libcxx/include/__type_traits/remove_all_extents.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_REMOVE_ALL_EXTENTS_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/remove_extent.h b/libcxx/include/__type_traits/remove_extent.h index fe37b5c7266c6..aceeb47069660 100644 --- a/libcxx/include/__type_traits/remove_extent.h +++ b/libcxx/include/__type_traits/remove_extent.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_REMOVE_EXTENT_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h index 02905707ee37a..0d9ca98958377 100644 --- a/libcxx/include/__type_traits/type_list.h +++ 
b/libcxx/include/__type_traits/type_list.h @@ -10,7 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_TYPE_LIST_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__utility/in_place.h b/libcxx/include/__utility/in_place.h index fa7a2f4bfd4a9..459b271675261 100644 --- a/libcxx/include/__utility/in_place.h +++ b/libcxx/include/__utility/in_place.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_IN_PLACE_H #include <__config> +#include <__type_traits/integral_constant.h> #include <__type_traits/remove_cvref.h> #include diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 0afbebcdc9f2a..78534a3f399f2 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -25,6 +25,7 @@ #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constructible.h> diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h index ab88b8e0a0b53..ecfbdec75a2ae 100644 --- a/libcxx/include/__utility/swap.h +++ b/libcxx/include/__utility/swap.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_SWAP_H #include <__config> +#include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_nothrow_assignable.h> diff --git a/libcxx/include/any b/libcxx/include/any index 5def14dc87e6b..7630e8a057d05 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -90,6 +90,7 @@ namespace std { #include <__type_traits/aligned_storage.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_constructible.h> #include <__type_traits/is_function.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 592f6261a6de3..2b138d9690393 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -34,9 +34,6 @@ Types: */ #include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> #include #include @@ -53,80 +50,10 @@ Types: # pragma GCC system_header #endif -_LIBCPP_BEGIN_NAMESPACE_STD - -using ::nullptr_t; -using ::ptrdiff_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; - -#if !defined(_LIBCPP_CXX03_LANG) -using ::max_align_t _LIBCPP_USING_IF_EXISTS; -#endif - -_LIBCPP_END_NAMESPACE_STD - -#if _LIBCPP_STD_VER >= 17 -namespace std { // purposefully not versioned - -enum class byte : unsigned char {}; - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { - return static_cast( - static_cast(static_cast(__lhs) | static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator|=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs | __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator&(byte __lhs, byte __rhs) noexcept { - return static_cast( - static_cast(static_cast(__lhs) & static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator&=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs & __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator^(byte __lhs, byte __rhs) noexcept { - return static_cast( - 
static_cast(static_cast(__lhs) ^ static_cast(__rhs))); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte& operator^=(byte& __lhs, byte __rhs) noexcept { - return __lhs = __lhs ^ __rhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr byte operator~(byte __b) noexcept { - return static_cast(static_cast(~static_cast(__b))); -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte& operator<<=(byte& __lhs, _Integer __shift) noexcept { - return __lhs = __lhs << __shift; -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte operator<<(byte __lhs, _Integer __shift) noexcept { - return static_cast(static_cast(static_cast(__lhs) << __shift)); -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte& operator>>=(byte& __lhs, _Integer __shift) noexcept { - return __lhs = __lhs >> __shift; -} - -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr byte operator>>(byte __lhs, _Integer __shift) noexcept { - return static_cast(static_cast(static_cast(__lhs) >> __shift)); -} - -template ::value, int> = 0> -[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Integer to_integer(byte __b) noexcept { - return static_cast<_Integer>(__b); -} - -} // namespace std -#endif // _LIBCPP_STD_VER >= 17 +#include <__cstddef/byte.h> +#include <__cstddef/max_align_t.h> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/cstdio b/libcxx/include/cstdio index 7f94371081f8b..a461c24dcc019 100644 --- a/libcxx/include/cstdio +++ b/libcxx/include/cstdio @@ -96,6 +96,7 @@ void perror(const char* s); */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -115,7 +116,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD using ::FILE _LIBCPP_USING_IF_EXISTS; using ::fpos_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::fclose _LIBCPP_USING_IF_EXISTS; using ::fflush _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cstdlib b/libcxx/include/cstdlib index c817fd8f4accd..1ecdd3a6d0328 100644 --- a/libcxx/include/cstdlib +++ b/libcxx/include/cstdlib @@ -82,6 +82,7 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -99,7 +100,6 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 _LIBCPP_BEGIN_NAMESPACE_STD -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::div_t _LIBCPP_USING_IF_EXISTS; using ::ldiv_t _LIBCPP_USING_IF_EXISTS; using ::lldiv_t _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cstring b/libcxx/include/cstring index c2c92b02e73cc..5bb6e3e10628f 100644 --- a/libcxx/include/cstring +++ b/libcxx/include/cstring @@ -57,6 +57,7 @@ size_t strlen(const char* s); */ #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_constant_evaluated.h> #include @@ -75,7 +76,6 @@ size_t strlen(const char* s); _LIBCPP_BEGIN_NAMESPACE_STD -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::memcpy _LIBCPP_USING_IF_EXISTS; using ::memmove _LIBCPP_USING_IF_EXISTS; using ::strcpy _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/ctime b/libcxx/include/ctime index f47b49a43e23e..d2d2470f2fa60 100644 --- a/libcxx/include/ctime +++ b/libcxx/include/ctime @@ -46,6 +46,7 @@ int timespec_get( struct timespec *ts, int base); // C++17 */ #include <__config> +#include <__cstddef/size_t.h> // is not provided by libc++ #if __has_include() @@ -62,7 +63,6 @@ int timespec_get( struct timespec *ts, int base); // C++17 _LIBCPP_BEGIN_NAMESPACE_STD using ::clock_t _LIBCPP_USING_IF_EXISTS; 
-using ::size_t _LIBCPP_USING_IF_EXISTS; using ::time_t _LIBCPP_USING_IF_EXISTS; using ::tm _LIBCPP_USING_IF_EXISTS; #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/cuchar b/libcxx/include/cuchar index f0015be275367..bfc26f03aaf78 100644 --- a/libcxx/include/cuchar +++ b/libcxx/include/cuchar @@ -37,6 +37,7 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps); */ #include <__config> +#include <__cstddef/size_t.h> #include @@ -57,7 +58,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if !defined(_LIBCPP_CXX03_LANG) using ::mbstate_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; # if !defined(_LIBCPP_HAS_NO_C8RTOMB_MBRTOC8) using ::mbrtoc8 _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar index 08cfac58c846a..f5a26664c1c3e 100644 --- a/libcxx/include/cwchar +++ b/libcxx/include/cwchar @@ -103,6 +103,7 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, */ #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/copy_cv.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_equality_comparable.h> @@ -127,7 +128,6 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, _LIBCPP_BEGIN_NAMESPACE_STD using ::mbstate_t _LIBCPP_USING_IF_EXISTS; -using ::size_t _LIBCPP_USING_IF_EXISTS; using ::tm _LIBCPP_USING_IF_EXISTS; using ::wint_t _LIBCPP_USING_IF_EXISTS; using ::FILE _LIBCPP_USING_IF_EXISTS; diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index 39f60d64f7b41..c60c08b0ea459 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> #include <__utility/declval.h> diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index 1add4653209ac..a2aeeb5cd0f54 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -11,6 +11,7 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H #include <__assert> +#include <__type_traits/integral_constant.h> #include #include #include diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index 37e334aad6da0..db0f9b39d9600 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index fd6dee2e28ee9..d54d4898b718a 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H +#include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include #include diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 316866b84873d..5787f237bb01e 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ 
b/libcxx/include/experimental/__simd/vec_ext.h @@ -12,6 +12,7 @@ #include <__assert> #include <__bit/bit_ceil.h> +#include <__type_traits/integral_constant.h> #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 7cde21417561e..3abc11723a5a9 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -440,6 +440,11 @@ module std_cstdbool [system] { } module std_cstddef [system] { header "cstddef" + module byte { header "__cstddef/byte.h" } + module max_align_t { header "__cstddef/max_align_t.h" } + module nullptr_t { header "__cstddef/nullptr_t.h" } + module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } + module size_t { header "__cstddef/size_t.h" } export * } module std_cstdint [system] { @@ -1418,6 +1423,7 @@ module std_private_iterator_iterator [system] { header "__iterato module std_private_iterator_iterator_traits [system] { header "__iterator/iterator_traits.h" export std_private_type_traits_is_primary_template + export std_private_type_traits_integral_constant } module std_private_iterator_iterator_with_data [system] { header "__iterator/iterator_with_data.h" } module std_private_iterator_mergeable [system] { @@ -1814,7 +1820,10 @@ module std_private_tuple_tuple_like [system] { export * } module std_private_tuple_tuple_like_ext [system] { header "__tuple/tuple_like_ext.h" } -module std_private_tuple_tuple_size [system] { header "__tuple/tuple_size.h" } +module std_private_tuple_tuple_size [system] { + header "__tuple/tuple_size.h" + export std_private_type_traits_integral_constant +} module std_private_tuple_tuple_types [system] { header "__tuple/tuple_types.h" } module std_private_type_traits_add_const [system] { header "__type_traits/add_const.h" } @@ -1926,7 +1935,10 @@ module std_private_type_traits_is_implicitly_default_constructible [system header "__type_traits/is_implicitly_default_constructible.h" export std_private_type_traits_integral_constant } -module std_private_type_traits_is_integral [system] { header "__type_traits/is_integral.h" } +module std_private_type_traits_is_integral [system] { + header "__type_traits/is_integral.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } module std_private_type_traits_is_nothrow_assignable [system] { header "__type_traits/is_nothrow_assignable.h" } @@ -1954,7 +1966,10 @@ module std_private_type_traits_is_primary_template [system header "__type_traits/is_primary_template.h" export std_private_type_traits_enable_if } -module std_private_type_traits_is_reference [system] { header "__type_traits/is_reference.h" } +module std_private_type_traits_is_reference [system] { + header "__type_traits/is_reference.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_reference_wrapper [system] { header "__type_traits/is_reference_wrapper.h" } module std_private_type_traits_is_referenceable [system] { header "__type_traits/is_referenceable.h" } module std_private_type_traits_is_same [system] { @@ -1976,7 +1991,10 @@ module std_private_type_traits_is_swappable [system module std_private_type_traits_is_trivial [system] { header "__type_traits/is_trivial.h" } module std_private_type_traits_is_trivially_assignable [system] { header "__type_traits/is_trivially_assignable.h" 
} module std_private_type_traits_is_trivially_constructible [system] { header "__type_traits/is_trivially_constructible.h" } -module std_private_type_traits_is_trivially_copyable [system] { header "__type_traits/is_trivially_copyable.h" } +module std_private_type_traits_is_trivially_copyable [system] { + header "__type_traits/is_trivially_copyable.h" + export std_private_type_traits_integral_constant +} module std_private_type_traits_is_trivially_destructible [system] { header "__type_traits/is_trivially_destructible.h" } module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" } module std_private_type_traits_is_trivially_relocatable [system] { header "__type_traits/is_trivially_relocatable.h" } @@ -2044,7 +2062,10 @@ module std_private_utility_exception_guard [system] { header "__utility/e module std_private_utility_exchange [system] { header "__utility/exchange.h" } module std_private_utility_forward [system] { header "__utility/forward.h" } module std_private_utility_forward_like [system] { header "__utility/forward_like.h" } -module std_private_utility_in_place [system] { header "__utility/in_place.h" } +module std_private_utility_in_place [system] { + header "__utility/in_place.h" + export std_private_type_traits_integral_constant +} module std_private_utility_integer_sequence [system] { header "__utility/integer_sequence.h" } module std_private_utility_is_pointer_in_range [system] { header "__utility/is_pointer_in_range.h" } module std_private_utility_is_valid_range [system] { header "__utility/is_valid_range.h" } diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 2727cad02fa99..54e0b4cf5d634 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -58,6 +58,7 @@ public: #include <__config> #include <__exception/exception.h> +#include <__type_traits/integral_constant.h> #include <__type_traits/is_constant_evaluated.h> #include <__verbose_abort> #include diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index acf3485e644af..69314ba756319 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -600,7 +600,9 @@ template #include <__ranges/concepts.h> #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> +#include <__type_traits/is_integral.h> #include <__type_traits/type_identity.h> #include <__utility/forward.h> #include diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index d11ceacba8143..fb50f78a02941 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -548,7 +548,9 @@ template #include <__ranges/concepts.h> #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> +#include <__type_traits/is_integral.h> #include <__utility/forward.h> #include diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 3bf39ea17c912..51f38ea086ac0 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -231,7 +231,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -918,7 +917,6 @@ tuple type_traits 
tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 49125486cfcf6..f7b0179f6f60e 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -232,7 +232,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -925,7 +924,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 28dfb320fe06c..11afb76583a8c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -233,7 +233,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -928,7 +927,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 5b7b6cecf73f8..42ba4ef3f6f52 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -233,7 +233,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -929,7 +928,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 84ea6433fb12d..cd48b37520f15 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -241,7 +241,6 @@ expected cstddef expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iosfwd experimental/iterator iterator experimental/iterator type_traits @@ -936,7 +935,6 @@ tuple type_traits tuple typeinfo tuple utility tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 946ba486294d3..db09568fc76ff 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -155,7 +155,6 @@ expected cstdint expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iterator experimental/memory cstddef experimental/memory cstdint @@ -647,7 +646,6 @@ tuple compare tuple cstddef tuple cstdint tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 946ba486294d3..db09568fc76ff 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ 
b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -155,7 +155,6 @@ expected cstdint expected initializer_list expected new expected version -experimental/iterator cstddef experimental/iterator iterator experimental/memory cstddef experimental/memory cstdint @@ -647,7 +646,6 @@ tuple compare tuple cstddef tuple cstdint tuple version -type_traits cstddef type_traits cstdint type_traits version typeindex compare diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index 5e37db95ab090..1ab1aa60ab826 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index 38a46fcc12227..f243e9d1f63b5 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index baf2032a4a1f0..e6a0cfb9d11e0 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index 81dca301e21fb..a1088218a35f0 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index 92268cf563b47..82931162b4f39 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 9d5d361662e8c..20e0eff91b253 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index 63b60640ac048..1fedc4f8a5386 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 1df1d883a12e1..4221b86fe1cc6 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git 
a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index 588c5e0cf7af2..a7c5c43a4e2c2 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index 16eabbd2a5a4d..72e412772fb08 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index 53405588266f7..fc0fff60394e3 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp index 9d5d37abde958..897da8b769e6a 100644 --- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp +++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp @@ -122,7 +122,9 @@ void header_exportable_declarations::registerMatchers(clang::ast_matchers::Match [[fallthrough]]; case FileType::ModulePartition: case FileType::CompatModulePartition: - finder->addMatcher(namedDecl(isExpansionInFileMatching(filename_)).bind("header_exportable_declarations"), this); + finder->addMatcher(namedDecl(anyOf(isExpansionInFileMatching(filename_), isExpansionInFileMatching(extra_header_))) + .bind("header_exportable_declarations"), + this); break; case FileType::Module: case FileType::CompatModule: diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py index b7758dc9a41ee..91933d4f425bd 100644 --- a/libcxx/utils/libcxx/test/modules.py +++ b/libcxx/utils/libcxx/test/modules.py @@ -99,6 +99,10 @@ # same definition. ExtraHeader["functional"] = "v1/__compare/compare_three_way.h$" +# Some C compatibility headers define std::size_t, which is in <__cstddef/size_t.h> +for header in ("cstdio", "cstdlib", "cstring", "ctime", "cuchar", "cwchar"): + ExtraHeader[header] = "v1/__cstddef/size_t.h$" + # newline needs to be escaped for the module partition output. 
nl = "\\\\n" From 485d191f0ca5e31a60fe2489ac99270ed5c7a594 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 5 Sep 2024 12:29:55 +0000 Subject: [PATCH 216/425] [gn build] Port d6832a611a7c --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 39ee220ee3a72..371ca7fc7a37e 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -396,6 +396,11 @@ if (current_toolchain == default_toolchain) { "__coroutine/coroutine_traits.h", "__coroutine/noop_coroutine_handle.h", "__coroutine/trivial_awaitables.h", + "__cstddef/byte.h", + "__cstddef/max_align_t.h", + "__cstddef/nullptr_t.h", + "__cstddef/ptrdiff_t.h", + "__cstddef/size_t.h", "__debug_utils/randomize_range.h", "__debug_utils/sanitizers.h", "__debug_utils/strict_weak_ordering_check.h", From 2a07509c8d3c8b5b2c88e4f73dde0071bf506870 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 13:42:59 +0100 Subject: [PATCH 217/425] [Clang] Add __builtin_is_within_lifetime to implement P2641R4's std::is_within_lifetime (#91895) [P2641R4](https://wg21.link/P2641R4) This new builtin function is declared `consteval`. Support for `-fexperimental-new-constant-interpreter` will be added in a later patch. --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Basic/Builtins.td | 6 + .../include/clang/Basic/DiagnosticASTKinds.td | 12 +- .../clang/Basic/DiagnosticSemaKinds.td | 4 + clang/lib/AST/ByteCode/State.h | 1 + clang/lib/AST/ExprConstant.cpp | 109 ++++- clang/lib/CodeGen/CGBuiltin.cpp | 3 + clang/lib/Sema/SemaChecking.cpp | 40 ++ clang/lib/Sema/SemaExpr.cpp | 3 +- .../SemaCXX/builtin-is-within-lifetime.cpp | 431 ++++++++++++++++++ clang/test/SemaCXX/consteval-builtin.cpp | 93 ++++ 11 files changed, 698 insertions(+), 7 deletions(-) create mode 100644 clang/test/SemaCXX/builtin-is-within-lifetime.cpp create mode 100644 clang/test/SemaCXX/consteval-builtin.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dc103aceebc36..ab3c3e6049f60 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -122,6 +122,9 @@ C++2c Feature Support - Implemented `P2747R2 constexpr placement new `_. +- Added the ``__builtin_is_within_lifetime`` builtin, which supports + `P2641R4 Checking if a union alternative is active `_ + C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. 
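For readers skimming the patch, a minimal illustration of the behavior the release note describes may help; it closely mirrors the `x1{ .c = 2 }` case in the new SemaCXX test further down, and the names `U` and `u` are illustrative rather than taken from the diff:

  // Sketch only: checking which union member is active during constant
  // evaluation with the new consteval builtin (requires this patch).
  union U {
    int i;
    char c;
  };
  constexpr U u{.c = 2};                               // 'c' is the active member

  static_assert(__builtin_is_within_lifetime(&u.c));   // true: c is within its lifetime
  static_assert(!__builtin_is_within_lifetime(&u.i));  // false: i is not the active member

Because the builtin is declared consteval, it can only be invoked during constant evaluation (for example from a static_assert or another consteval function); the std::is_within_lifetime wrapper from P2641R4 is expected to forward to it, as the test file below demonstrates.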
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 9e2a590f265ac..92118418d9d45 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -934,6 +934,12 @@ def IsConstantEvaluated : LangBuiltin<"CXX_LANG"> { let Prototype = "bool()"; } +def IsWithinLifetime : LangBuiltin<"CXX_LANG"> { + let Spellings = ["__builtin_is_within_lifetime"]; + let Attributes = [NoThrow, CustomTypeChecking, Consteval]; + let Prototype = "bool(void*)"; +} + // GCC exception builtins def EHReturn : Builtin { let Spellings = ["__builtin_eh_return"]; diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 91135b18c7571..21a307d1e8987 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -167,14 +167,14 @@ def note_constexpr_this : Note< def access_kind : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|construction of|" - "destruction of}0">; + "destruction of|read of}0">; def access_kind_subobject : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|" - "construction of subobject of|destruction of}0">; + "construction of subobject of|destruction of|read of}0">; def access_kind_volatile : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" - "||||}0">; + "|||||}0">; def note_constexpr_lifetime_ended : Note< "%sub{access_kind}0 %select{temporary|variable}1 whose " "%plural{8:storage duration|:lifetime}0 has ended">; @@ -407,6 +407,12 @@ def warn_is_constant_evaluated_always_true_constexpr : Warning< "'%0' will always evaluate to 'true' in a manifestly constant-evaluated expression">, InGroup>; +def err_invalid_is_within_lifetime : Note< + "'%0' cannot be called with " + "%select{a null pointer|a one-past-the-end pointer|" + "a pointer to an object whose lifetime has not yet begun}1" +>; + // inline asm related. let CategoryName = "Inline Assembly Issue" in { def err_asm_invalid_escape : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dcb49d8a67604..72ea5338ce615 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12186,6 +12186,10 @@ def err_builtin_launder_invalid_arg : Error< "%select{non-pointer|function pointer|void pointer}0 argument to " "'__builtin_launder' is not allowed">; +def err_builtin_is_within_lifetime_invalid_arg : Error< + "%select{non-|function }0pointer argument to '__builtin_is_within_lifetime' " + "is not allowed">; + def err_builtin_invalid_arg_type: Error < "%ordinal0 argument must be " "%select{a vector, integer or floating point type|a matrix|" diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 2cffce4bc2ae4..3248e2d8be697 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -34,6 +34,7 @@ enum AccessKinds { AK_TypeId, AK_Construct, AK_Destroy, + AK_IsWithinLifetime, }; /// The order of this enum is important for diagnostics. 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 205cbdf52a6f7..0ad3577d4e102 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1522,7 +1522,8 @@ CallStackFrame::~CallStackFrame() { } static bool isRead(AccessKinds AK) { - return AK == AK_Read || AK == AK_ReadObjectRepresentation; + return AK == AK_Read || AK == AK_ReadObjectRepresentation || + AK == AK_IsWithinLifetime; } static bool isModification(AccessKinds AK) { @@ -1532,6 +1533,7 @@ static bool isModification(AccessKinds AK) { case AK_MemberCall: case AK_DynamicCast: case AK_TypeId: + case AK_IsWithinLifetime: return false; case AK_Assign: case AK_Increment: @@ -1549,7 +1551,8 @@ static bool isAnyAccess(AccessKinds AK) { /// Is this an access per the C++ definition? static bool isFormalAccess(AccessKinds AK) { - return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy; + return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy && + AK != AK_IsWithinLifetime; } /// Is this kind of axcess valid on an indeterminate object value? @@ -1561,6 +1564,7 @@ static bool isValidIndeterminateAccess(AccessKinds AK) { // These need the object's value. return false; + case AK_IsWithinLifetime: case AK_ReadObjectRepresentation: case AK_Assign: case AK_Construct: @@ -3707,7 +3711,8 @@ struct CompleteObject { // In C++14 onwards, it is permitted to read a mutable member whose // lifetime began within the evaluation. // FIXME: Should we also allow this in C++11? - if (!Info.getLangOpts().CPlusPlus14) + if (!Info.getLangOpts().CPlusPlus14 && + AK != AccessKinds::AK_IsWithinLifetime) return false; return lifetimeStartedInEvaluation(Info, Base, /*MutableSubobject*/true); } @@ -3760,6 +3765,12 @@ findSubobject(EvalInfo &Info, const Expr *E, const CompleteObject &Obj, if ((O->isAbsent() && !(handler.AccessKind == AK_Construct && I == N)) || (O->isIndeterminate() && !isValidIndeterminateAccess(handler.AccessKind))) { + // Object has ended lifetime. + // If I is non-zero, some subobject (member or array element) of a + // complete object has ended its lifetime, so this is valid for + // IsWithinLifetime, resulting in false. + if (I != 0 && handler.AccessKind == AK_IsWithinLifetime) + return false; if (!Info.checkingPotentialConstantExpression()) Info.FFDiag(E, diag::note_constexpr_access_uninit) << handler.AccessKind << O->isIndeterminate() @@ -3927,6 +3938,9 @@ findSubobject(EvalInfo &Info, const Expr *E, const CompleteObject &Obj, // Placement new onto an inactive union member makes it active. O->setUnion(Field, APValue()); } else { + // Pointer to/into inactive union member: Not within lifetime + if (handler.AccessKind == AK_IsWithinLifetime) + return false; // FIXME: If O->getUnionValue() is absent, report that there's no // active union member rather than reporting the prior active union // member. 
We'll need to fix nullptr_t to not use APValue() as its @@ -11684,6 +11698,9 @@ class IntExprEvaluator bool ZeroInitialization(const Expr *E) { return Success(0, E); } + friend std::optional EvaluateBuiltinIsWithinLifetime(IntExprEvaluator &, + const CallExpr *); + //===--------------------------------------------------------------------===// // Visitor Methods //===--------------------------------------------------------------------===// @@ -12743,6 +12760,11 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Info.InConstantContext, E); } + case Builtin::BI__builtin_is_within_lifetime: + if (auto result = EvaluateBuiltinIsWithinLifetime(*this, E)) + return Success(*result, E); + return false; + case Builtin::BI__builtin_ctz: case Builtin::BI__builtin_ctzl: case Builtin::BI__builtin_ctzll: @@ -17322,3 +17344,84 @@ bool Expr::tryEvaluateStrLen(uint64_t &Result, ASTContext &Ctx) const { EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); return EvaluateBuiltinStrLen(this, Result, Info); } + +namespace { +struct IsWithinLifetimeHandler { + EvalInfo &Info; + static constexpr AccessKinds AccessKind = AccessKinds::AK_IsWithinLifetime; + using result_type = std::optional; + std::optional failed() { return std::nullopt; } + template + std::optional found(T &Subobj, QualType SubobjType) { + return true; + } +}; + +std::optional EvaluateBuiltinIsWithinLifetime(IntExprEvaluator &IEE, + const CallExpr *E) { + EvalInfo &Info = IEE.Info; + // Sometimes this is called during some sorts of constant folding / early + // evaluation. These are meant for non-constant expressions and are not + // necessary since this consteval builtin will never be evaluated at runtime. + // Just fail to evaluate when not in a constant context. + if (!Info.InConstantContext) + return std::nullopt; + assert(E->getBuiltinCallee() == Builtin::BI__builtin_is_within_lifetime); + const Expr *Arg = E->getArg(0); + if (Arg->isValueDependent()) + return std::nullopt; + LValue Val; + if (!EvaluatePointer(Arg, Val, Info)) + return std::nullopt; + + auto Error = [&](int Diag) { + bool CalledFromStd = false; + const auto *Callee = Info.CurrentCall->getCallee(); + if (Callee && Callee->isInStdNamespace()) { + const IdentifierInfo *Identifier = Callee->getIdentifier(); + CalledFromStd = Identifier && Identifier->isStr("is_within_lifetime"); + } + Info.CCEDiag(CalledFromStd ? Info.CurrentCall->getCallRange().getBegin() + : E->getExprLoc(), + diag::err_invalid_is_within_lifetime) + << (CalledFromStd ? "std::is_within_lifetime" + : "__builtin_is_within_lifetime") + << Diag; + return std::nullopt; + }; + // C++2c [meta.const.eval]p4: + // During the evaluation of an expression E as a core constant expression, a + // call to this function is ill-formed unless p points to an object that is + // usable in constant expressions or whose complete object's lifetime began + // within E. 
+ + // Make sure it points to an object + // nullptr does not point to an object + if (Val.isNullPointer() || Val.getLValueBase().isNull()) + return Error(0); + QualType T = Val.getLValueBase().getType(); + assert(!T->isFunctionType() && + "Pointers to functions should have been typed as function pointers " + "which would have been rejected earlier"); + assert(T->isObjectType()); + // Hypothetical array element is not an object + if (Val.getLValueDesignator().isOnePastTheEnd()) + return Error(1); + assert(Val.getLValueDesignator().isValidSubobject() && + "Unchecked case for valid subobject"); + // All other ill-formed values should have failed EvaluatePointer, so the + // object should be a pointer to an object that is usable in a constant + // expression or whose complete lifetime began within the expression + CompleteObject CO = + findCompleteObject(Info, E, AccessKinds::AK_IsWithinLifetime, Val, T); + // The lifetime hasn't begun yet if we are still evaluating the + // initializer ([basic.life]p(1.2)) + if (Info.EvaluatingDeclValue && CO.Value == Info.EvaluatingDeclValue) + return Error(2); + + if (!CO) + return false; + IsWithinLifetimeHandler handler{Info}; + return findSubobject(Info, E, CO, Val.getLValueDesignator(), handler); +} +} // namespace diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e826c1c6fbbd2..02d8726baa421 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2538,6 +2538,9 @@ static RValue EmitHipStdParUnsupportedBuiltin(CodeGenFunction *CGF, RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue) { + assert(!getContext().BuiltinInfo.isImmediate(BuiltinID) && + "Should not codegen for consteval builtins"); + const FunctionDecl *FD = GD.getDecl()->getAsFunction(); // See if we can constant fold this builtin. If so, don't emit it at all. // TODO: Extend this handling to all builtin calls that we can constant-fold. diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index b01765b6833a1..2aab52160afa7 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1844,6 +1844,44 @@ static ExprResult BuiltinLaunder(Sema &S, CallExpr *TheCall) { return TheCall; } +static ExprResult BuiltinIsWithinLifetime(Sema &S, CallExpr *TheCall) { + if (S.checkArgCount(TheCall, 1)) + return ExprError(); + + ExprResult Arg = S.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0)); + if (Arg.isInvalid()) + return ExprError(); + QualType ParamTy = Arg.get()->getType(); + TheCall->setArg(0, Arg.get()); + TheCall->setType(S.Context.BoolTy); + + // Only accept pointers to objects as arguments, which should have object + // pointer or void pointer types. 
+ if (const auto *PT = ParamTy->getAs()) { + // LWG4138: Function pointer types not allowed + if (PT->getPointeeType()->isFunctionType()) { + S.Diag(TheCall->getArg(0)->getExprLoc(), + diag::err_builtin_is_within_lifetime_invalid_arg) + << 1; + return ExprError(); + } + // Disallow VLAs too since those shouldn't be able to + // be a template parameter for `std::is_within_lifetime` + if (PT->getPointeeType()->isVariableArrayType()) { + S.Diag(TheCall->getArg(0)->getExprLoc(), diag::err_vla_unsupported) + << 1 << "__builtin_is_within_lifetime"; + return ExprError(); + } + } else { + S.Diag(TheCall->getArg(0)->getExprLoc(), + diag::err_builtin_is_within_lifetime_invalid_arg) + << 0; + return ExprError(); + } + + return TheCall; +} + // Emit an error and return true if the current object format type is in the // list of unsupported types. static bool CheckBuiltinTargetNotInUnsupported( @@ -2276,6 +2314,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, } case Builtin::BI__builtin_launder: return BuiltinLaunder(*this, TheCall); + case Builtin::BI__builtin_is_within_lifetime: + return BuiltinIsWithinLifetime(*this, TheCall); case Builtin::BI__sync_fetch_and_add: case Builtin::BI__sync_fetch_and_add_1: case Builtin::BI__sync_fetch_and_add_2: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e291ef6c97eef..32dac4440fb82 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17622,7 +17622,8 @@ HandleImmediateInvocations(Sema &SemaRef, (SemaRef.inTemplateInstantiation() && !ImmediateEscalating)) { SemaRef.Diag(DR->getBeginLoc(), diag::err_invalid_consteval_take_address) << ND << isa(ND) << FD->isConsteval(); - SemaRef.Diag(ND->getLocation(), diag::note_declared_at); + if (!FD->getBuiltinID()) + SemaRef.Diag(ND->getLocation(), diag::note_declared_at); if (auto Context = SemaRef.InnermostDeclarationWithDelayedImmediateInvocations()) { SemaRef.Diag(Context->Loc, diag::note_invalid_consteval_initializer) diff --git a/clang/test/SemaCXX/builtin-is-within-lifetime.cpp b/clang/test/SemaCXX/builtin-is-within-lifetime.cpp new file mode 100644 index 0000000000000..62ff2681952ce --- /dev/null +++ b/clang/test/SemaCXX/builtin-is-within-lifetime.cpp @@ -0,0 +1,431 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-unused %s -verify=expected,cxx20 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++23 -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++26 -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension +// RUN: %clang_cc1 -std=c++26 -DINLINE_NAMESPACE -Wno-unused %s -verify=expected,sincecxx23 -Wno-vla-cxx-extension + +inline constexpr void* operator new(__SIZE_TYPE__, void* p) noexcept { return p; } +namespace std { +template +constexpr T* construct_at(T* p, Args&&... 
args) { return ::new((void*)p) T(static_cast(args)...); } +template +constexpr void destroy_at(T* p) { p->~T(); } +template +struct allocator { + constexpr T* allocate(__SIZE_TYPE__ n) { return static_cast(::operator new(n * sizeof(T))); } + constexpr void deallocate(T* p, __SIZE_TYPE__) { ::operator delete(p); } +}; +using nullptr_t = decltype(nullptr); +template +struct integral_constant { static constexpr T value = v; }; +template +using bool_constant = integral_constant; +using true_type = bool_constant; +using false_type = bool_constant; +template +inline constexpr bool is_function_v = __is_function(T); +#ifdef INLINE_NAMESPACE +inline namespace __1 { +#endif +template requires (!is_function_v) // #std-constraint +consteval bool is_within_lifetime(const T* p) noexcept { // #std-definition + return __builtin_is_within_lifetime(p); +} +#ifdef INLINE_NAMESPACE +} +#endif +} + +consteval bool test_union(int& i, char& c) { + if (__builtin_is_within_lifetime(&i) || __builtin_is_within_lifetime(&c)) + return false; + std::construct_at(&c, 1); + if (__builtin_is_within_lifetime(&i) || !__builtin_is_within_lifetime(&c)) + return false; + std::construct_at(&i, 3); + if (!__builtin_is_within_lifetime(&i) || __builtin_is_within_lifetime(&c)) + return false; + return true; +} + +static_assert([]{ + union { int i; char c; } u; + return test_union(u.i, u.c); +}()); +static_assert([]{ + union { int i; char c; }; + return test_union(i, c); +}()); +static_assert([]{ + struct { union { int i; char c; }; } u; + return test_union(u.i, u.c); +}()); +static_assert([]{ + struct { union { int i; char c; } u; } r; + return test_union(r.u.i, r.u.c); +}()); + +consteval bool test_nested() { + union { + union { int i; char c; } u; + long l; + }; + if (__builtin_is_within_lifetime(&l) || __builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&l); + if (!__builtin_is_within_lifetime(&l) || __builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&u); + std::construct_at(&u.i); + if (__builtin_is_within_lifetime(&l) || !__builtin_is_within_lifetime(&u) || !__builtin_is_within_lifetime(&u.i) || __builtin_is_within_lifetime(&u.c)) + return false; + std::construct_at(&u.c); + if (__builtin_is_within_lifetime(&l) || !__builtin_is_within_lifetime(&u) || __builtin_is_within_lifetime(&u.i) || !__builtin_is_within_lifetime(&u.c)) + return false; + return true; +} +static_assert(test_nested()); + +consteval bool test_dynamic(bool read_after_deallocate) { + std::allocator a; + int* p = a.allocate(1); + // a.allocate starts the lifetime of an array, + // the complete object of *p has started its lifetime + if (__builtin_is_within_lifetime(p)) + return false; + std::construct_at(p); + if (!__builtin_is_within_lifetime(p)) + return false; + std::destroy_at(p); + if (__builtin_is_within_lifetime(p)) + return false; + a.deallocate(p, 1); + if (read_after_deallocate) + __builtin_is_within_lifetime(p); // expected-note {{read of heap allocated object that has been deleted}} + return true; +} +static_assert(test_dynamic(false)); +static_assert(test_dynamic(true)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_dynamic(true)'}} + +consteval bool test_automatic(int read_dangling) { + int* p; + { + int x = 0; + p = &x; + if (!__builtin_is_within_lifetime(p)) + return false; + 
} + { + int x = 0; + if (read_dangling == 1) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + } + if (read_dangling == 2) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + { + int x[4]; + p = &x[2]; + if (!__builtin_is_within_lifetime(p)) + return false; + } + if (read_dangling == 3) + __builtin_is_within_lifetime(p); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + std::nullptr_t* q; + { + std::nullptr_t np = nullptr; + q = &np; + if (!__builtin_is_within_lifetime(q)) + return false; + } + if (read_dangling == 4) + __builtin_is_within_lifetime(q); // expected-note {{read of object outside its lifetime is not allowed in a constant expression}} + return true; +} +static_assert(test_automatic(0)); +static_assert(test_automatic(1)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(1)'}} +static_assert(test_automatic(2)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(2)'}} +static_assert(test_automatic(3)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(3)'}} +static_assert(test_automatic(4)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{in call to 'test_automatic(4)'}} + + +consteval bool test_indeterminate() { + int x; + if (!__builtin_is_within_lifetime(&x)) + return false; + bool b = true; + unsigned char c = __builtin_bit_cast(unsigned char, b); + if (!__builtin_is_within_lifetime(&c)) + return false; + struct {} padding; + unsigned char y = __builtin_bit_cast(unsigned char, padding); + if (!__builtin_is_within_lifetime(&y)) + return false; + return true; +} +static_assert(test_indeterminate()); + +consteval bool test_volatile() { + int x; + if (!__builtin_is_within_lifetime(static_cast(&x)) || !__builtin_is_within_lifetime(static_cast(&x))) + return false; + volatile int y; + if (!__builtin_is_within_lifetime(const_cast(&y)) || !__builtin_is_within_lifetime(const_cast(static_cast(&y)))) + return false; + return true; +} +static_assert(test_volatile()); + +constexpr bool self = __builtin_is_within_lifetime(&self); +// expected-error@-1 {{constexpr variable 'self' must be initialized by a constant expression}} +// expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-error@-3 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} +// expected-note@-4 {{initializer of 'self' is not a constant expression}} +// expected-note@-5 {{declared here}} +constexpr int external{}; +static_assert(__builtin_is_within_lifetime(&external)); +void not_constexpr() { + __builtin_is_within_lifetime(&external); +} +void invalid_args() { + __builtin_is_within_lifetime(static_cast(nullptr)); + // expected-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a null pointer}} + + // FIXME: avoid function to pointer conversion on all consteval builtins + __builtin_is_within_lifetime(0); + // 
expected-error@-1 {{non-pointer argument to '__builtin_is_within_lifetime' is not allowed}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(1, 2); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(&external, &external); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} +} + +constexpr struct { + union { + int i; + char c; + }; + mutable int mi; // #x-mi +} x1{ .c = 2 }; +static_assert(!__builtin_is_within_lifetime(&x1.i)); +static_assert(__builtin_is_within_lifetime(&x1.c)); +static_assert(__builtin_is_within_lifetime(&x1.mi)); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of mutable member 'mi' is not allowed in a constant expression}} +// expected-note@#x-mi {{declared here}} + +constexpr struct NSDMI { // #NSDMI + bool a = true; + bool b = __builtin_is_within_lifetime(&a); // #NSDMI-read +} x2; +// expected-error@-1 {{constexpr variable 'x2' must be initialized by a constant expression}} +// expected-note@#NSDMI-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-3 {{in call to 'NSDMI()'}} +// expected-error@-4 {{call to immediate function 'NSDMI::NSDMI' is not a constant expression}} +// expected-note@#NSDMI {{'NSDMI' is an immediate constructor because the default initializer of 'b' contains a call to a consteval function '__builtin_is_within_lifetime' and that call is not a constant expression}} +// expected-note@#NSDMI-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-7 {{in call to 'NSDMI()'}} + +struct X3 { + consteval X3() { + __builtin_is_within_lifetime(this); // #X3-read + } +} x3; +// expected-error@-1 {{call to consteval function 'X3::X3' is not a constant expression}} +// expected-note@#X3-read {{'__builtin_is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@-3 {{in call to 'X3()'}} + +constexpr int i = 2; +static_assert(__builtin_is_within_lifetime(const_cast(&i))); +static_assert(__builtin_is_within_lifetime(const_cast(&i))); +static_assert(__builtin_is_within_lifetime(static_cast(&i))); + +constexpr int arr[2]{}; +static_assert(__builtin_is_within_lifetime(arr)); +static_assert(__builtin_is_within_lifetime(arr + 0)); +static_assert(__builtin_is_within_lifetime(arr + 1)); +void f() { + __builtin_is_within_lifetime(&i + 1); + // expected-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a one-past-the-end pointer}} + __builtin_is_within_lifetime(arr + 2); + // expected-error@-1 {{call to consteval function 
'__builtin_is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'__builtin_is_within_lifetime' cannot be called with a one-past-the-end pointer}} +} + +template +consteval void disallow_function_types(bool b, const T* p) { + if (b) { + __builtin_is_within_lifetime(p); // expected-error {{function pointer argument to '__builtin_is_within_lifetime' is not allowed}} + } +} +void g() { + disallow_function_types(false, &f); + // expected-note@-1 {{in instantiation of function template specialization 'disallow_function_types' requested here}} +} + +struct OptBool { + union { bool b; char c; }; + + // note: this assumes common implementation properties for bool and char: + // * sizeof(bool) == sizeof(char), and + // * the value representations for true and false are distinct + // from the value representation for 2 + constexpr OptBool() : c(2) { } + constexpr OptBool(bool b) : b(b) { } + + constexpr auto has_value() const -> bool { + if consteval { // cxx20-warning {{consteval if}} + return __builtin_is_within_lifetime(&b); // during constant evaluation, cannot read from c + } else { + return c != 2; // during runtime, must read from c + } + } + + constexpr auto operator*() const -> const bool& { + return b; + } +}; + +constexpr OptBool disengaged; +constexpr OptBool engaged(true); +static_assert(!disengaged.has_value()); +static_assert(engaged.has_value()); +static_assert(*engaged); + +namespace vlas { + +consteval bool f(int n) { + int vla[n]; // cxx20-error {{variable of non-literal type}} + return __builtin_is_within_lifetime(static_cast(&vla)); +} +static_assert(f(1)); + +consteval bool fail(int n) { + int vla[n]; // cxx20-error {{variable of non-literal type}} + return __builtin_is_within_lifetime(&vla); // expected-error {{variable length arrays are not supported in '__builtin_is_within_lifetime'}} +} +static_assert(fail(1)); // sincecxx23-error {{static assertion expression is not an integral constant expression}} + +consteval bool variably_modified(int n) { + int(* p)[n]; + return __builtin_is_within_lifetime(&p); +} +static_assert(variably_modified(1)); + +} // namespace vlas + +consteval bool partial_arrays() { + int arr[2]; + if (!__builtin_is_within_lifetime(&arr) || !__builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + std::destroy_at(&arr[0]); + if (!__builtin_is_within_lifetime(&arr) || __builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + std::construct_at(&arr[0]); + if (!__builtin_is_within_lifetime(&arr) || !__builtin_is_within_lifetime(&arr[0]) || !__builtin_is_within_lifetime(&arr[1])) + return false; + return true; +} +static_assert(partial_arrays()); + +consteval bool partial_members() { + struct S { + int x; + int y; + } s; + if (!__builtin_is_within_lifetime(&s) || !__builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + std::destroy_at(&s.x); + if (!__builtin_is_within_lifetime(&s) || __builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + std::construct_at(&s.x); + if (!__builtin_is_within_lifetime(&s) || !__builtin_is_within_lifetime(&s.x) || !__builtin_is_within_lifetime(&s.y)) + return false; + return true; +} + +struct NonTrivial { + constexpr NonTrivial() {} + constexpr NonTrivial(const NonTrivial&) {} + constexpr ~NonTrivial() {} +}; + +template +constexpr T& unmove(T&& temp) { return static_cast(temp); } + +consteval bool test_temporaries() { + 
static_assert(__builtin_is_within_lifetime(&unmove(0))); + static_assert(__builtin_is_within_lifetime(&unmove(NonTrivial{}))); + if (!__builtin_is_within_lifetime(&unmove(0))) + return false; + if (!__builtin_is_within_lifetime(&unmove(NonTrivial{}))) + return false; + return true; +} +static_assert(test_temporaries()); + +constexpr const int& temp = 0; +static_assert(__builtin_is_within_lifetime(&temp)); + +template +constexpr T* test_dangling() { + T i; // expected-note 2 {{declared here}} + return &i; // expected-warning 2 {{address of stack memory associated with local variable 'i' returned}} +} +static_assert(__builtin_is_within_lifetime(test_dangling())); // expected-note {{in instantiation of function template specialization}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of variable whose lifetime has ended}} +static_assert(__builtin_is_within_lifetime(test_dangling())); // expected-note {{in instantiation of function template specialization}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@-2 {{read of variable whose lifetime has ended}} + +template +concept CanCallAndPassToIsWithinLifetime = std::bool_constant<__builtin_is_within_lifetime(F())>::value; +static_assert(CanCallAndPassToIsWithinLifetime<[]{ return &i; }>); +static_assert(!CanCallAndPassToIsWithinLifetime<[]{ return static_cast(nullptr); }>); +static_assert(!CanCallAndPassToIsWithinLifetime<[]{ return static_cast(&f); }>); +template constexpr std::true_type sfinae() requires CanCallAndPassToIsWithinLifetime { return {}; } +template std::false_type sfinae() { return {}; } +static_assert(decltype(sfinae<[]{ return &i; }>())::value); +static_assert(!decltype(sfinae<[]{ return static_cast(nullptr); }>())::value); +std::true_type(* not_immediate)() = &sfinae<[]{ return &i; }>; + +void test_std_error_message() { + std::is_within_lifetime(static_cast(nullptr)); + // expected-error@-1 {{call to consteval function 'std::is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'std::is_within_lifetime' cannot be called with a null pointer}} + // expected-note@-3 {{in call to 'is_within_lifetime(nullptr)'}} + std::is_within_lifetime(&test_std_error_message); + // expected-error@-1 {{no matching function for call to 'is_within_lifetime'}} + // expected-note@#std-definition {{candidate template ignored: constraints not satisfied [with T = void ()]}} + // expected-note@#std-constraint {{because '!is_function_v' evaluated to false}} + std::is_within_lifetime(arr + 2); + // expected-error@-1 {{call to consteval function 'std::is_within_lifetime' is not a constant expression}} + // expected-note@-2 {{'std::is_within_lifetime' cannot be called with a one-past-the-end pointer}} + // expected-note@-3 {{in call to 'is_within_lifetime(&arr[2])'}} +} +struct XStd { + consteval XStd() { + std::is_within_lifetime(this); // #XStd-read + } +} xstd; +// expected-error@-1 {{call to consteval function 'XStd::XStd' is not a constant expression}} +// expected-note@#XStd-read {{'std::is_within_lifetime' cannot be called with a pointer to an object whose lifetime has not yet begun}} +// expected-note@#XStd-read {{in call to 'is_within_lifetime(&)'}} +// expected-note@-4 {{in call to 'XStd()'}} diff --git a/clang/test/SemaCXX/consteval-builtin.cpp b/clang/test/SemaCXX/consteval-builtin.cpp new file mode 100644 index 0000000000000..3ba95b4dbd9b5 --- /dev/null +++ 
b/clang/test/SemaCXX/consteval-builtin.cpp @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -Wno-unused %s -verify=cxx20-cxx26 +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -Wno-unused %s -verify=cxx20,cxx20-cxx26 +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -Wno-unused %s -verify=precxx20,cxx11-cxx17 +// RUN: %clang_cc1 -std=c++03 -fsyntax-only -Wno-unused %s -verify=precxx20 +// RUN: %clang_cc1 -std=c++98 -fsyntax-only -Wno-unused %s -verify=precxx20 +// RUN: %clang_cc1 -x c -std=c23 -fsyntax-only -Wno-unused %s -verify=c + +#if __has_builtin(__builtin_is_within_lifetime) +#error has the builtin +#else +#error does not have the builtin +#endif +// cxx20-cxx26-error@-4 {{has the builtin}} +// precxx20-error@-3 {{does not have the builtin}} +// c-error@-4 {{does not have the builtin}} + +#if __has_constexpr_builtin(__builtin_is_within_lifetime) +#error has the constexpr builtin +#else +#error does not have the constexpr builtin +#endif +// cxx20-cxx26-error@-4 {{has the constexpr builtin}} +// precxx20-error@-3 {{does not have the constexpr builtin}} +// c-error@-4 {{does not have the constexpr builtin}} + +#if __cplusplus < 201103L +#define static_assert __extension__ _Static_assert +#define CONSTEXPR11 +#else +#define CONSTEXPR11 constexpr +#endif + +static const int i1 = 0; +static_assert(__builtin_is_within_lifetime(&i1), ""); +// precxx20-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// c-error@-2 {{use of undeclared identifier '__builtin_is_within_lifetime'}} + +#if !defined(__cplusplus) || __cplusplus >= 201102L +constexpr int i2 = 0; +static_assert(__builtin_is_within_lifetime(&i2), ""); +// cxx11-cxx17-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// c-error@-2 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +#endif + +#ifdef __cplusplus +template +CONSTEXPR11 bool f1(T i) { // #f1 + return __builtin_is_within_lifetime(&i); // #f1-consteval-call +} + +bool(&fp1)(int) = f1; +// cxx20-cxx26-error@-1 {{cannot take address of immediate function 'f1' outside of an immediate invocation}} +// cxx20-cxx26-note@#f1 {{declared here}} +// cxx20-cxx26-note@#f1-consteval-call {{'f1' is an immediate function because its body contains a call to a consteval function '__builtin_is_within_lifetime' and that call is not a constant expression}} +// precxx20-error@#f1-consteval-call {{use of undeclared identifier '__builtin_is_within_lifetime'}} +// precxx20-note@-5 {{in instantiation of function template specialization 'f1' requested here}} +#else +void f1(int i) { + __builtin_is_within_lifetime(&i); + // c-error@-1 {{use of undeclared identifier '__builtin_is_within_lifetime'}} +} +#endif + +#if __cplusplus >= 202002L +constexpr void f2() { + int i = 0; + if consteval { // cxx20-warning {{consteval if}} + __builtin_is_within_lifetime(&i); + } +} +void(&fp2)() = f2; + +constexpr void f3() { + __builtin_is_within_lifetime(&i1); +} +void(&fp3)() = f3; + +constexpr void f4() { + &__builtin_is_within_lifetime; + // cxx20-cxx26-error@-1 {{builtin functions must be directly called}} + // cxx20-cxx26-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + __builtin_is_within_lifetime(); + // cxx20-cxx26-error@-1 {{too few arguments to function call, expected 1, have 0}} + // 
cxx20-cxx26-error@-2 {{cannot take address of consteval function '__builtin_is_within_lifetime' outside of an immediate invocation}} + int* not_constexpr; + __builtin_is_within_lifetime(not_constexpr); + // cxx20-cxx26-error@-1 {{call to consteval function '__builtin_is_within_lifetime' is not a constant expression}} + // cxx20-cxx26-note@-2 {{read of non-constexpr variable 'not_constexpr' is not allowed in a constant expression}} + // cxx20-cxx26-note@-4 {{declared here}} +} +#endif From e4fdbcc28f19b59fef065f2a6f939f91f286b9a8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 4 Sep 2024 17:27:46 -0400 Subject: [PATCH 218/425] [libc++] Add miscellaneous missing includes --- libcxx/include/__memory/uninitialized_algorithms.h | 1 + libcxx/include/forward_list | 1 + libcxx/include/list | 1 + libcxx/include/string | 1 + 4 files changed, 4 insertions(+) diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h index 72db3a266fdd4..8ff87e28b3bb5 100644 --- a/libcxx/include/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__memory/uninitialized_algorithms.h @@ -22,6 +22,7 @@ #include <__memory/construct_at.h> #include <__memory/pointer_traits.h> #include <__memory/voidify.h> +#include <__type_traits/enable_if.h> #include <__type_traits/extent.h> #include <__type_traits/is_array.h> #include <__type_traits/is_constant_evaluated.h> diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 6c0dc5f96a5d5..3187b11e4dde7 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -218,6 +218,7 @@ template #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_const.h> #include <__type_traits/is_nothrow_assignable.h> diff --git a/libcxx/include/list b/libcxx/include/list index 76b1d9241b41c..2aa774451ec2a 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -225,6 +225,7 @@ template #include <__ranges/container_compatible_range.h> #include <__ranges/from_range.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/string b/libcxx/include/string index 5cb0693ad10bc..3480b57375c11 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -621,6 +621,7 @@ basic_string operator""s( const char32_t *str, size_t len ); #include <__string/char_traits.h> #include <__string/extern_template_lists.h> #include <__type_traits/conditional.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_array.h> #include <__type_traits/is_convertible.h> From 2c3da172d1869a2e261af38c45582027a9ff6af7 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 5 Sep 2024 14:01:12 +0100 Subject: [PATCH 219/425] LIR: strip unused LAA header dependency (NFC) (#107382) LoopIdiomRecognize does not use LoopAccessAnalysis. Make this clear. 
--- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 0ee1afa76a823..578d087e470e1 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -40,7 +40,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" From 3d01f0a33b9a14545217938fbd2475226ade2719 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Thu, 5 Sep 2024 15:03:22 +0200 Subject: [PATCH 220/425] [mlir][gpu] Add 'cluster_stride' attribute to gpu.subgroup_reduce (#107142) Follow-up to 7aa22f013e24d20291aad745368ff907baa9dfa4, adding an additional attribute needed in some applications. --- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 32 +++++-- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 13 ++- .../GPU/Transforms/SubgroupReduceLowering.cpp | 90 +++++++++++-------- mlir/test/Dialect/GPU/canonicalize.mlir | 2 +- mlir/test/Dialect/GPU/invalid.mlir | 21 ++++- .../Dialect/GPU/subgroup-reduce-lowering.mlir | 47 +++++++--- 6 files changed, 144 insertions(+), 61 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index d2a5e5d77ad84..6098eb34d04d5 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1200,10 +1200,12 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] The `subgroup_reduce` op reduces the values of lanes (work items) across a subgroup. - The subgroup is divided into clusters of `cluster_size` contiguous lanes - each, and a reduction is done for every lane of each cluster (in parallel). - The result is equal for all lanes in a cluster. When `cluster_size` is - omitted, there is a single cluster covering the entire subgroup. + The subgroup is divided into clusters starting at lane index 0. Within each + cluster, there are `size` lanes, and the lane index advances by `stride`. + A reduction is done for each cluster in parallel: every lane in the cluster + is reduced, and the result is equal for all lanes in the cluster. If `size` + is omitted, there is a single cluster covering the entire subgroup. If + `stride` is omitted, the stride is 1 (the cluster's lanes are contiguous). When the reduced value is of a vector type, each vector element is reduced independently. Only 1-d vector types are allowed. 
@@ -1213,7 +1215,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] ```mlir %1 = gpu.subgroup_reduce add %a : (f32) -> f32 %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16> - %3 = gpu.subgroup_reduce add %c cluster_size(4) : (f32) -> f32 + %3 = gpu.subgroup_reduce add %c cluster(size = 4) : (f32) -> f32 + %3 = gpu.subgroup_reduce add %c cluster(size = 4, stride = 2) : (f32) -> f32 ``` If `uniform` flag is set either none or all lanes of a subgroup need to execute @@ -1230,7 +1233,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] AnyIntegerOrFloatOr1DVector:$value, GPU_AllReduceOperationAttr:$op, UnitAttr:$uniform, - OptionalAttr:$cluster_size + OptionalAttr:$cluster_size, + DefaultValuedAttr:$cluster_stride ); let results = (outs AnyIntegerOrFloatOr1DVector:$result); @@ -1238,19 +1242,29 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] OpBuilder<(ins "Value":$value, "::mlir::gpu::AllReduceOperation":$op, "bool":$uniform), [{ - build($_builder, $_state, value, op, uniform, /*cluster_size=*/ nullptr); + build($_builder, $_state, value, op, uniform, std::nullopt); }]>, OpBuilder<(ins "Value":$value, "::mlir::gpu::AllReduceOperation":$op, "bool":$uniform, "std::optional":$cluster_size), [{ - build($_builder, $_state, value, op, uniform, cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr); + build($_builder, $_state, value, op, uniform, + cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr); + }]>, + OpBuilder<(ins "Value":$value, + "::mlir::gpu::AllReduceOperation":$op, + "bool":$uniform, + "std::optional":$cluster_size, + "uint32_t":$cluster_stride), [{ + build($_builder, $_state, value, op, uniform, + cluster_size ? $_builder.getI32IntegerAttr(*cluster_size) : nullptr, + cluster_stride); }]> ]; let assemblyFormat = [{ custom($op) $value (`uniform` $uniform^)? - (`cluster_size` `(` $cluster_size^ `)`)? + (`cluster` `(` `size` `=` $cluster_size^ (`,` `stride` `=` $cluster_stride^)? `)`)? 
attr-dict `:` functional-type(operands, results) }]; diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index e45ba7838b453..f822c11aeec00 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -621,7 +621,8 @@ LogicalResult gpu::SubgroupReduceOp::verify() { << getType(); } - if (auto clusterSize = getClusterSize()) { + auto clusterSize = getClusterSize(); + if (clusterSize) { uint32_t size = *clusterSize; if (!llvm::isPowerOf2_32(size)) { return emitOpError() << "cluster size " << size @@ -629,6 +630,16 @@ LogicalResult gpu::SubgroupReduceOp::verify() { } } + uint32_t stride = getClusterStride(); + if (stride != 1 && !clusterSize) { + return emitOpError() << "cluster stride can only be specified if cluster " + "size is specified"; + } + if (!llvm::isPowerOf2_32(stride)) { + return emitOpError() << "cluster stride " << stride + << " is not a power of two"; + } + return success(); } diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 288f7ab9f3022..2cf5d12588a73 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -50,8 +50,6 @@ struct BreakDownSubgroupReduce final : OpRewritePattern { LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - auto vecTy = dyn_cast(op.getType()); if (!vecTy || vecTy.getNumElements() < 2) return rewriter.notifyMatchFailure(op, "not a multi-element reduction"); @@ -97,7 +95,8 @@ struct BreakDownSubgroupReduce final : OpRewritePattern { } Value reduce = rewriter.create( - loc, extracted, op.getOp(), op.getUniform(), clusterSize); + loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(), + op.getClusterStride()); if (numElems == 1) { res = rewriter.create(loc, reduce, res, startIdx); continue; @@ -129,8 +128,6 @@ struct ScalarizeSingleElementReduce final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - auto vecTy = dyn_cast(op.getType()); if (!vecTy || vecTy.getNumElements() != 1) return rewriter.notifyMatchFailure(op, "not a single-element reduction"); @@ -140,34 +137,64 @@ struct ScalarizeSingleElementReduce final Location loc = op.getLoc(); Value extracted = rewriter.create(loc, op.getValue(), 0); Value reduce = rewriter.create( - loc, extracted, op.getOp(), op.getUniform(), clusterSize); + loc, extracted, op.getOp(), op.getUniform(), op.getClusterSize(), + op.getClusterStride()); rewriter.replaceOpWithNewOp(op, vecTy, reduce); return success(); } }; +struct ClusterInfo { + unsigned clusterStride; + unsigned clusterSize; + unsigned subgroupSize; +}; + +static FailureOr +getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { + assert(llvm::isPowerOf2_32(subgroupSize)); + + std::optional clusterSize = op.getClusterSize(); + assert(!clusterSize || + llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. + if (clusterSize && *clusterSize > subgroupSize) + return op.emitOpError() + << "cluster size " << *clusterSize + << " is greater than subgroup size " << subgroupSize; + unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + + auto clusterStride = op.getClusterStride(); + assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. 
+ if (clusterStride >= subgroupSize) + return op.emitOpError() + << "cluster stride " << clusterStride + << " is not less than subgroup size " << subgroupSize; + + return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; +} + /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn` /// and `unpackFn` to convert to the native shuffle type and to the reduction /// type, respectively. For example, with `input` of type `f16`, `packFn` could /// build ops to cast the value to `i32` to perform shuffles, while `unpackFn` /// would cast it back to `f16` to perform arithmetic reduction on. Assumes that /// the subgroup is `subgroupSize` lanes wide and divides it into clusters of -/// `clusterSize` lanes, reducing all lanes in each cluster in parallel. -static Value createSubgroupShuffleReduction( - OpBuilder &builder, Location loc, Value input, gpu::AllReduceOperation mode, - unsigned clusterSize, unsigned subgroupSize, - function_ref packFn, function_ref unpackFn) { - assert(llvm::isPowerOf2_32(clusterSize)); - assert(llvm::isPowerOf2_32(subgroupSize)); - assert(clusterSize <= subgroupSize); +/// `clusterSize` lanes starting at lane 0 with a stride of `clusterStride` for +/// lanes within a cluster, reducing all lanes in each cluster in parallel. +Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc, + Value input, gpu::AllReduceOperation mode, + const ClusterInfo &ci, + function_ref packFn, + function_ref unpackFn) { // Lane value always stays in the original type. We use it to perform arith // reductions. Value laneVal = input; // Parallel reduction using butterfly shuffles. - for (unsigned i = 1; i < clusterSize; i <<= 1) { + for (unsigned i = ci.clusterStride; i < ci.clusterStride * ci.clusterSize; + i <<= 1) { Value shuffled = builder .create(loc, packFn(laneVal), i, - /*width=*/subgroupSize, + /*width=*/ci.subgroupSize, /*mode=*/gpu::ShuffleMode::XOR) .getShuffleResult(); laneVal = vector::makeArithReduction(builder, loc, @@ -190,12 +217,9 @@ struct ScalarSubgroupReduceToShuffles final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - if (clusterSize && *clusterSize > subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); Type valueTy = op.getType(); unsigned elemBitwidth = @@ -209,9 +233,8 @@ struct ScalarSubgroupReduceToShuffles final if (elemBitwidth == shuffleBitwidth) { auto identityFn = [](Value v) { return v; }; rewriter.replaceOp(op, createSubgroupShuffleReduction( - rewriter, loc, op.getValue(), op.getOp(), - effectiveClusterSize, subgroupSize, identityFn, - identityFn)); + rewriter, loc, op.getValue(), op.getOp(), *ci, + identityFn, identityFn)); return success(); } @@ -232,8 +255,7 @@ struct ScalarSubgroupReduceToShuffles final rewriter.replaceOp( op, createSubgroupShuffleReduction(rewriter, loc, op.getValue(), - op.getOp(), effectiveClusterSize, - subgroupSize, packFn, unpackFn)); + op.getOp(), *ci, packFn, unpackFn)); return success(); } @@ -253,12 +275,9 @@ struct VectorSubgroupReduceToShuffles final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { - std::optional clusterSize = op.getClusterSize(); - if (clusterSize && *clusterSize > 
subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); auto vecTy = dyn_cast(op.getType()); if (!vecTy) @@ -308,9 +327,8 @@ struct VectorSubgroupReduceToShuffles final return rewriter.create(loc, extendedVecTy, asIntVec); }; - Value res = createSubgroupShuffleReduction(rewriter, loc, extendedInput, - op.getOp(), effectiveClusterSize, - subgroupSize, packFn, unpackFn); + Value res = createSubgroupShuffleReduction( + rewriter, loc, extendedInput, op.getOp(), *ci, packFn, unpackFn); if (vecBitwidth < shuffleBitwidth) { res = rewriter.create( diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir index 469c03c9460df..d342ae9df10ee 100644 --- a/mlir/test/Dialect/GPU/canonicalize.mlir +++ b/mlir/test/Dialect/GPU/canonicalize.mlir @@ -255,7 +255,7 @@ func.func @subgroup_reduce_cluster_size_1() { gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2) threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) { %1 = "test.test2"() : () -> i32 - %2 = gpu.subgroup_reduce add %1 cluster_size(1) : (i32) -> (i32) + %2 = gpu.subgroup_reduce add %1 cluster(size=1) : (i32) -> (i32) "test.test3"(%2) : (i32) -> () gpu.terminator } diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index fd7618020b5d8..2a0f7e8c6b10c 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -335,7 +335,7 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) { func.func @subgroup_reduce_zero_cluster_size(%arg0 : vector<4xf32>) { // expected-error@+1 {{cluster size 0 is not a power of two}} - %res = gpu.subgroup_reduce add %arg0 cluster_size(0) : (vector<4xf32>) -> vector<4xf32> + %res = gpu.subgroup_reduce add %arg0 cluster(size = 0) : (vector<4xf32>) -> vector<4xf32> return } @@ -343,10 +343,27 @@ func.func @subgroup_reduce_zero_cluster_size(%arg0 : vector<4xf32>) { func.func @subgroup_reduce_npot_cluster_size(%arg0 : vector<4xf32>) { // expected-error@+1 {{cluster size 3 is not a power of two}} - %res = gpu.subgroup_reduce add %arg0 cluster_size(3) : (vector<4xf32>) -> vector<4xf32> + %res = gpu.subgroup_reduce add %arg0 cluster(size = 3) : (vector<4xf32>) -> vector<4xf32> return } +// ----- + +func.func @subgroup_reduce_zero_cluster_stride(%arg0 : vector<4xf32>) { + // expected-error@+1 {{cluster stride 0 is not a power of two}} + %res = gpu.subgroup_reduce add %arg0 cluster(size = 4, stride = 0) : (vector<4xf32>) -> vector<4xf32> + return +} + +// ----- + +func.func @subgroup_reduce_cluster_stride_without_size(%arg0 : vector<4xf32>) { + // expected-error@+1 {{cluster stride can only be specified if cluster size is specified}} + %res = gpu.subgroup_reduce add %arg0 { cluster_stride = 2 : i32 } : (vector<4xf32>) -> vector<4xf32> + return +} + + // ----- func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) { diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 37608ce4cfed7..9f2aa1be52fc3 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -34,14 +34,14 @@ gpu.module @kernels { %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>) 
"test.consume"(%sum1) : (vector<5xf16>) -> () - // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster_size(4) + // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - %sum2 = gpu.subgroup_reduce mul %arg0 cluster_size(4) : (vector<5xf16>) -> (vector<5xf16>) + %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () - // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform cluster_size(4) + // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform cluster(size = 4, stride = 2) // CHECK-SUB: "test.consume" - %sum3 = gpu.subgroup_reduce mul %arg0 uniform cluster_size(4) : (vector<5xf16>) -> (vector<5xf16>) + %sum3 = gpu.subgroup_reduce mul %arg0 uniform cluster(size = 4, stride = 2) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum3) : (vector<5xf16>) -> () // CHECK-SUB: gpu.return @@ -65,14 +65,15 @@ gpu.module @kernels { %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () - // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster_size(8) : (f32) -> f32 + // Note stride is dropped because it is == 1. + // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - %sum2 = gpu.subgroup_reduce add %arg0 cluster_size(8) : (vector<1xf32>) -> (vector<1xf32>) + %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () - // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster_size(8) : (f32) -> f32 + // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster_size(8) : (vector<1xf32>) -> (vector<1xf32>) + %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () // CHECK-SUB: gpu.return @@ -143,7 +144,29 @@ gpu.module @kernels { // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(8) : (i32) -> i32 + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 + "test.consume"(%sum0) : (i32) -> () + + // CHECK-SHFL: gpu.return + gpu.return + } + + // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { + // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 + // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 + // CHECK-SHFL-DAG: %[[C4:.+]] = arith.constant 16 : i32 + // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 + + // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[ARG0]], %[[C1]], %[[C32]] : i32 + // CHECK-SHFL: %[[A0:.+]] = arith.addi %[[ARG0]], %[[S0]] : i32 + // CHECK-SHFL: %[[S1:.+]], %{{.+}} = gpu.shuffle xor %[[A0]], %[[C2]], %[[C32]] : i32 + // CHECK-SHFL: %[[A1:.+]] = arith.addi %[[A0]], %[[S1]] : i32 + // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 + // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 + // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 4) : 
(i32) -> i32 "test.consume"(%sum0) : (i32) -> () // CHECK-SHFL: gpu.return @@ -194,7 +217,7 @@ gpu.module @kernels { // CHECK-SHFL-DAG: %[[C32:.+]] = arith.constant 32 : i32 // CHECK-SHFL-COUNT-2: gpu.shuffle xor - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(4) : (vector<2xf16>) -> (vector<2xf16>) + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 4) : (vector<2xf16>) -> (vector<2xf16>) "test.consume"(%sum0) : (vector<2xf16>) -> () // CHECK-SHFL: gpu.return @@ -234,7 +257,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(16) : (i16) -> i16 + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () // CHECK-SHFL: gpu.return @@ -268,7 +291,7 @@ gpu.module @kernels { // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel { // CHECK-SHFL-COUNT-5: gpu.shuffle xor - %sum0 = gpu.subgroup_reduce add %arg0 cluster_size(32) : (vector<3xi8>) -> (vector<3xi8>) + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>) "test.consume"(%sum0) : (vector<3xi8>) -> () // CHECK-SHFL: gpu.return From 7d1a68178ef4332c9bf19a5c959a3ec4cef0285d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:10:00 +0200 Subject: [PATCH 221/425] [SystemZ] Use APInt::getAllOnes() This was using -1 without setting the signed flag. Split off from https://github.com/llvm/llvm-project/pull/80309. --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 6f84bd6c6e4ff..582a8c139b293 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7981,7 +7981,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts, break; } case Intrinsic::s390_vperm: - SrcDemE = APInt(NumElts, -1); + SrcDemE = APInt::getAllOnes(NumElts); break; default: llvm_unreachable("Unhandled intrinsic."); From 67e19e5bb11d8ed2f1b5a0b8145331c8bf4522e9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:17:25 +0200 Subject: [PATCH 222/425] [flang] Set isSigned=true for negative constant (NFC) We're providing this as a negative signed value, so set the flag. Currently doesn't make a difference, but will assert in the future. Split out of https://github.com/llvm/llvm-project/pull/80309. 
--- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index eb91969236ae0..ac521ae95df39 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1262,7 +1262,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } else { auto maskAttr = mlir::IntegerAttr::get( rewriter.getIntegerType(8, /*isSigned=*/false), - llvm::APInt(8, (uint64_t)~_CFI_ADDENDUM_FLAG, /*isSigned=*/false)); + llvm::APInt(8, (uint64_t)~_CFI_ADDENDUM_FLAG, /*isSigned=*/true)); mlir::LLVM::ConstantOp mask = rewriter.create( loc, rewriter.getI8Type(), maskAttr); extraField = rewriter.create(loc, extraField, mask); From 9e9971b100e121b83f1de9e9206cddb52cda4815 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 15:19:16 +0200 Subject: [PATCH 223/425] [PatternMatchTest] Use APInt::getAllOnes() (NFC) Split out from https://github.com/llvm/llvm-project/pull/80309 to avoid assertion failures in the future. --- llvm/unittests/IR/PatternMatch.cpp | 202 ++++++++++++++--------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 379f97fb63139..13f121a2b9c7d 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -71,7 +71,7 @@ TEST_F(PatternMatchTest, SpecificIntEQ) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, 0)) @@ -93,15 +93,15 @@ TEST_F(PatternMatchTest, SpecificIntEQ) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntNE) { @@ -110,7 +110,7 @@ TEST_F(PatternMatchTest, SpecificIntNE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, 0)) @@ -132,15 +132,15 @@ TEST_F(PatternMatchTest, SpecificIntNE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + 
EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntUGT) { @@ -149,7 +149,7 @@ TEST_F(PatternMatchTest, SpecificIntUGT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, 0)) @@ -171,15 +171,15 @@ TEST_F(PatternMatchTest, SpecificIntUGT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SignbitZeroChecks) { @@ -187,7 +187,7 @@ TEST_F(PatternMatchTest, SignbitZeroChecks) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE(m_Negative().match(NegOne)); EXPECT_FALSE(m_NonNegative().match(NegOne)); @@ -211,7 +211,7 @@ TEST_F(PatternMatchTest, SpecificIntUGE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, 0)) @@ -233,15 +233,15 @@ TEST_F(PatternMatchTest, SpecificIntUGE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_UGE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntULT) { @@ -250,7 +250,7 @@ TEST_F(PatternMatchTest, SpecificIntULT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, 0)) @@ -272,15 +272,15 @@ TEST_F(PatternMatchTest, SpecificIntULT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - 
.match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntULE) { @@ -289,7 +289,7 @@ TEST_F(PatternMatchTest, SpecificIntULE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, 0)) @@ -311,15 +311,15 @@ TEST_F(PatternMatchTest, SpecificIntULE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSGT) { @@ -328,7 +328,7 @@ TEST_F(PatternMatchTest, SpecificIntSGT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, 0)) @@ -350,15 +350,15 @@ TEST_F(PatternMatchTest, SpecificIntSGT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSGE) { @@ -367,7 +367,7 @@ TEST_F(PatternMatchTest, SpecificIntSGE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, 0)) @@ -389,15 +389,15 @@ TEST_F(PatternMatchTest, SpecificIntSGE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(Zero)); 
- EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SGE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSLT) { @@ -406,7 +406,7 @@ TEST_F(PatternMatchTest, SpecificIntSLT) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_FALSE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, 0)) @@ -428,15 +428,15 @@ TEST_F(PatternMatchTest, SpecificIntSLT) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(One)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLT, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, SpecificIntSLE) { @@ -445,7 +445,7 @@ TEST_F(PatternMatchTest, SpecificIntSLE) { Value *Zero = ConstantInt::get(IntTy, 0); Value *One = ConstantInt::get(IntTy, 1); - Value *NegOne = ConstantInt::get(IntTy, -1); + Value *NegOne = Constant::getAllOnesValue(IntTy); EXPECT_TRUE( m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, 0)) @@ -467,15 +467,15 @@ TEST_F(PatternMatchTest, SpecificIntSLE) { m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, 1)) .match(NegOne)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(Zero)); - EXPECT_FALSE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(One)); - EXPECT_TRUE( - m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, APInt(BitWidth, -1)) - .match(NegOne)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(Zero)); + EXPECT_FALSE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(One)); + EXPECT_TRUE(m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_SLE, + APInt::getAllOnes(BitWidth)) + .match(NegOne)); } TEST_F(PatternMatchTest, Unless) { From eae1d6152fd77511f943fd7f300a971c53453e70 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 14:28:37 +0100 Subject: [PATCH 224/425] [X86] Add test coverage for #107289 --- .../CodeGen/X86/vector-shuffle-combining.ll | 56 ++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index b5adfb3733357..923af983f1d47 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3533,6 +3533,58 @@ 
define <4 x i32> @PR63700(i128 %0) { ret <4 x i32> %shuffle.i11 } +define <16 x i8> @PR107289(<16 x i8> %0) { +; SSE2-LABEL: PR107289: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: shldq $8, %rax, %rcx +; SSE2-NEXT: shlq $8, %rax +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR107289: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: shldq $8, %rax, %rcx +; SSSE3-NEXT: shlq $8, %rax +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR107289: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: shldq $8, %rcx, %rax +; SSE41-NEXT: shlq $8, %rcx +; SSE41-NEXT: movq %rax, %xmm1 +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: PR107289: +; AVX: # %bb.0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rcx +; AVX-NEXT: shldq $8, %rcx, %rax +; AVX-NEXT: shlq $8, %rcx +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %src = bitcast <16 x i8> %0 to i128 + %shl = shl i128 %src, 8 + %res = bitcast i128 %shl to <16 x i8> + ret <16 x i8> %res +} + ; Test case reported on D105827 define void @SpinningCube() { ; SSE2-LABEL: SpinningCube: @@ -3641,9 +3693,9 @@ define void @autogen_SD25931() { ; CHECK-LABEL: autogen_SD25931: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB141_1: # %CF242 +; CHECK-NEXT: .LBB142_1: # %CF242 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp .LBB141_1 +; CHECK-NEXT: jmp .LBB142_1 BB: %Cmp16 = icmp uge <2 x i1> zeroinitializer, zeroinitializer %Shuff19 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp16, <2 x i32> From 1a1264726db275d4b207c5bc640e2779dd484478 Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Thu, 5 Sep 2024 15:44:16 +0200 Subject: [PATCH 225/425] [libc++][math] Add `constexpr` for `std::signbit()` (#105946) ## Why Since 18th of August, the floating point comparison builtin ``__builtin_signbit`` is available in Clang as constant expression (https://github.com/llvm/llvm-project/pull/94118). ## What * Implement `constexpr` for `std::signbit()` as defined by [P0533R9](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p0533r9.pdf) (new C++23 feature) * Restrict execution of tests to tip-of-trunk Clang as builtin is not yet available (note that builtin is available in GCC) --- libcxx/include/__math/traits.h | 13 +++- .../c.math/constexpr-cxx23-clang.pass.cpp | 7 ++ .../c.math/constexpr-cxx23-gcc.pass.cpp | 6 +- .../test/std/numerics/c.math/signbit.pass.cpp | 78 +++++++++++++++++++ 4 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/numerics/c.math/signbit.pass.cpp diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h index 0638a6949580e..3d4f14fc9cd55 100644 --- a/libcxx/include/__math/traits.h +++ b/libcxx/include/__math/traits.h @@ -27,18 +27,25 @@ namespace __math { // signbit +// TODO(LLVM 22): Remove conditional once support for Clang 19 is dropped. 
+#if defined(_LIBCPP_COMPILER_GCC) || __has_constexpr_builtin(__builtin_signbit) +# define _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_CONSTEXPR_SINCE_CXX23 +#else +# define _LIBCPP_SIGNBIT_CONSTEXPR +#endif + template <class _A1, __enable_if_t<is_floating_point<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { return __builtin_signbit(__x); } template <class _A1, __enable_if_t<is_integral<_A1>::value && is_signed<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { return __x < 0; } template <class _A1, __enable_if_t<is_integral<_A1>::value && !is_signed<_A1>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1) _NOEXCEPT { return false; } diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp index a07260a34516f..3f17f21e8c108 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp @@ -220,9 +220,16 @@ int main(int, char**) { ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1); +// TODO(LLVM 22): Remove `__has_constexpr_builtin` conditional once support for Clang 19 is dropped. +#if !__has_constexpr_builtin(__builtin_signbit) ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); +#else + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); +#endif ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0); diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp index 8c481f41a945e..d8779706bcee2 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-gcc.pass.cpp @@ -217,9 +217,9 @@ int main(int, char**) { ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); + ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0); diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp new file mode 100644 index 0000000000000..c85033e363ce5 --- /dev/null +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// bool signbit(floating-point-type x); // constexpr since C++23
+
+// We don't control the implementation on windows
+// UNSUPPORTED: windows
+
+// These compilers don't support constexpr `__builtin_signbit` yet.
+// UNSUPPORTED: clang-17, clang-18, clang-19, apple-clang-15, apple-clang-16
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+
+#include "test_macros.h"
+#include "type_algorithms.h"
+
+struct TestFloat {
+  template <class T>
+  static TEST_CONSTEXPR_CXX23 bool test() {
+    assert(!std::signbit(T(0)));
+    assert(!std::signbit(std::numeric_limits<T>::min()));
+    assert(!std::signbit(std::numeric_limits<T>::denorm_min()));
+    assert(!std::signbit(std::numeric_limits<T>::max()));
+    assert(!std::signbit(std::numeric_limits<T>::infinity()));
+    assert(!std::signbit(std::numeric_limits<T>::quiet_NaN()));
+    assert(!std::signbit(std::numeric_limits<T>::signaling_NaN()));
+    assert(std::signbit(-T(0)));
+    assert(std::signbit(-std::numeric_limits<T>::infinity()));
+    assert(std::signbit(std::numeric_limits<T>::lowest()));
+
+    return true;
+  }
+
+  template <class T>
+  TEST_CONSTEXPR_CXX23 void operator()() {
+    test<T>();
+#if TEST_STD_VER >= 23
+    static_assert(test<T>());
+#endif
+  }
+};
+
+struct TestInt {
+  template <class T>
+  static TEST_CONSTEXPR_CXX23 bool test() {
+    assert(!std::signbit(std::numeric_limits<T>::max()));
+    assert(!std::signbit(T(0)));
+    if (std::is_unsigned<T>::value) {
+      assert(!std::signbit(std::numeric_limits<T>::lowest()));
+    } else {
+      assert(std::signbit(std::numeric_limits<T>::lowest()));
+    }
+
+    return true;
+  }
+
+  template <class T>
+  TEST_CONSTEXPR_CXX23 void operator()() {
+    test<T>();
+#if TEST_STD_VER >= 23
+    static_assert(test<T>());
+#endif
+  }
+};
+
+int main(int, char**) {
+  types::for_each(types::floating_point_types(), TestFloat());
+  types::for_each(types::integral_types(), TestInt());
+
+  return 0;
+}

From 7f0c5b0502b462d2afad32d3681b37cfc15ba844 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Thu, 5 Sep 2024 14:47:10 +0100
Subject: [PATCH 226/425] [AArch64] Fix invalid use of ld1/st1 in stack alloc (#105518)

This patch fixes incorrect usage of the scalar+immediate variant of the
ld1/st1 instructions during stack allocation, introduced by
[c4bac7f](https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9).
That commit used ld1/st1 even when the stack offset was outside the
immediate range of these instructions, producing invalid assembly, and it
also used incorrect offsets when ld1/st1 could be used.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |   27 +-
 llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll  |   68 +-
 .../CodeGen/AArch64/sme2-intrinsics-ld1.ll    | 1456 +++++++++++------
 .../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll  | 1456 +++++++++++------
 .../AArch64/sve-callee-save-restore-pairs.ll  |   84 +-
 5 files changed, 1952 insertions(+), 1139 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ad20e76d0fe2e..7e041b086599b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs(
       ByteOffset += StackFillDir * StackHazardSize;
     LastReg = RPI.Reg1;
 
+    int Scale = RPI.getScale();
     // Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3045,9 +3046,14 @@ static void computeCalleeSaveRegisterPairs( case RegPairInfo::PPR: break; case RegPairInfo::ZPR: - if (AFI->getPredicateRegForFillSpill() != 0) - if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) + if (AFI->getPredicateRegForFillSpill() != 0 && + ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) { + // Calculate offset of register pair to see if pair instruction can be + // used. + int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale; + if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0)) RPI.Reg2 = NextReg; + } break; case RegPairInfo::VG: break; @@ -3087,7 +3093,6 @@ static void computeCalleeSaveRegisterPairs( if (NeedsWinCFI && RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); - int Scale = RPI.getScale(); int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; assert(OffsetPre % Scale == 0); @@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineMemOperand::MOStore, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale], + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale], + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale] + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( } MIB.addReg(Reg1, getDefRegState(true)); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale] + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index a96f9e382ed1a..8724e7c1c368d 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -332,16 +332,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b 
{ z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -349,7 +349,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG @@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG ; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte 
Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -427,16 +429,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: ptrue pn8.b ; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -444,7 +446,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG @@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: addvl sp, sp, #1 ; FP-CHECK-NEXT: ptrue pn8.b +; 
FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index 29d3d68fc4c3d..c63899cf7d257 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -55,31 +55,45 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -477,14 +569,20 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -497,15 +595,21 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -573,31 +677,45 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload 
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -736,14 +880,20 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -756,15 +906,21 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -832,31 +988,45 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded 
Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -995,14 +1191,20 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1015,15 +1217,21 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1093,32 +1301,46 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1271,13 +1517,18 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1295,14 +1546,19 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload 
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1376,32 +1632,46 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1413,13 +1683,18 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1437,14 +1712,19 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul 
vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1518,32 +1798,46 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1555,13 +1849,18 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, 
[sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1579,14 +1878,19 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1660,32 +1964,46 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1697,13 +2015,18 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1721,14 +2044,19 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; 
CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1802,32 +2130,46 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1839,13 +2181,18 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1863,14 +2210,19 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul 
vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1944,32 +2296,46 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1981,13 +2347,18 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul 
vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2005,14 +2376,19 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2086,32 +2462,46 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2123,13 +2513,18 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2147,14 +2542,19 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 3d3748e101122..05241f788d3ea 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -8,31 +8,45 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded 
Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -124,14 +164,20 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -144,15 +190,21 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -173,31 +225,45 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte 
Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -289,14 +381,20 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -309,15 +407,21 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -338,31 +442,45 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -454,14 +598,20 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -474,15 +624,21 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -503,31 +659,45 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte 
Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -619,14 +815,20 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -639,15 +841,21 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -668,32 +876,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -705,13 +927,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -729,14 +956,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -760,32 +992,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, 
[sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -797,13 +1043,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, 
[sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -821,14 +1072,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -853,32 +1109,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -890,13 +1160,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -914,14 +1189,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; 
CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -945,32 +1225,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -982,13 +1276,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1006,14 +1305,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] 
// 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1038,32 +1342,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1075,13 +1393,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul 
vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1099,14 +1422,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1130,32 +1458,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] 
// 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1167,13 +1509,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1191,14 +1538,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded 
Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1223,32 +1575,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; 
STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1260,13 +1626,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1284,14 +1655,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, 
mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1315,32 +1691,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; 
STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1352,13 +1742,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, 
[sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1376,14 +1771,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll index 470c0dd45782c..c9d216935edbf 100644 --- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -91,16 +91,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; 
PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -108,7 +108,8 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -122,15 +123,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -233,16 +235,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded 
Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -250,7 +252,8 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -264,15 +267,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte 
Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -328,7 +332,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -338,7 +342,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: //NO_APP ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload @@ -385,7 +389,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -396,7 +400,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #4 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -431,17 +435,17 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() { ; PAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-3 ; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #3 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; PAIR-NEXT: ret From 80cd2141eb7f6e7be738a01348bc2ccd08b41cd6 Mon Sep 17 00:00:00 2001 From: Mogball Date: Thu, 5 Sep 2024 14:45:38 +0100 Subject: [PATCH 227/425] [mlir][llvm] Add `externally_initialized` support to GlobalOp This maps the `externally_initialized` flag in `llvm::GlobalVariable` to `GlobalOp` and adds exported support. --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 1 + mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 3 +-- mlir/test/Target/LLVMIR/llvmir.mlir | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 46bf1c9640c17..86f6b3e6326c2 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1122,6 +1122,7 @@ def LLVM_GlobalOp : LLVM_Op<"mlir.global", Linkage:$linkage, UnitAttr:$dso_local, UnitAttr:$thread_local_, + UnitAttr:$externally_initialized, OptionalAttr:$value, OptionalAttr:$alignment, DefaultValuedAttr, "0">:$addr_space, diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index bb23da039e21f..fcb329eb7a92c 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1021,7 +1021,6 @@ LogicalResult ModuleTranslation::convertGlobals() { } auto linkage = convertLinkageToLLVM(op.getLinkage()); - auto addrSpace = op.getAddrSpace(); // LLVM IR requires constant with linkage other than external or weak // external to have initializers. If MLIR does not provide an initializer, @@ -1037,7 +1036,7 @@ LogicalResult ModuleTranslation::convertGlobals() { /*InsertBefore=*/nullptr, op.getThreadLocal_() ? 
llvm::GlobalValue::GeneralDynamicTLSModel : llvm::GlobalValue::NotThreadLocal, - addrSpace); + op.getAddrSpace(), op.getExternallyInitialized()); if (std::optional comdat = op.getComdat()) { auto selectorOp = cast( diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index df61fef605fde..d2cd0221e0ea7 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -39,6 +39,9 @@ llvm.mlir.global internal constant @string_const("foobar") : !llvm.array<6 x i8> // CHECK: @int_global_undef = internal global i64 undef llvm.mlir.global internal @int_global_undef() : i64 +// CHECK: @externally_initialized_global = internal externally_initialized global i32 0 +llvm.mlir.global internal @externally_initialized_global(0 : i32) {externally_initialized} : i32 + // CHECK: @f8E3M4_global_as_i8 = internal global i8 56 llvm.mlir.global internal @f8E3M4_global_as_i8(1.5 : f8E3M4) : i8 From 0ba78182b975d8ccd8ca42b33fbf038a85a44747 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 14:54:05 +0100 Subject: [PATCH 228/425] [X86] LowerSelect - generalize "select icmp(x,0), lhs, rhs" folding patterns [NFC] (#107374) This patch proposes we add a LowerSELECTWithCmpZero helper, which allows us to fold the compare-with-zero from different condition nodes with minimal duplication. So far I've only handled the simple no-cmov case for or/xor nodes, but the intention is to convert more folds in future PRs. NFC preliminary patch for #107272 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 87 +++++++++++++++---------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 092a7192929fd..5f87ffd2f1eab 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24074,6 +24074,55 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } +// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns. +static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, + unsigned X86CC, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT CmpVT = CmpVal.getValueType(); + EVT VT = LHS.getValueType(); + if (!CmpVT.isScalarInteger() || !VT.isScalarInteger()) + return SDValue(); + + if (!Subtarget.canUseCMOV() && X86CC == X86::COND_E && + CmpVal.getOpcode() == ISD::AND && isOneConstant(CmpVal.getOperand(1))) { + SDValue Src1, Src2; + // true if RHS is XOR or OR operator and one of its operands + // is equal to LHS + // ( a , a op b) || ( b , a op b) + auto isOrXorPattern = [&]() { + if ((RHS.getOpcode() == ISD::XOR || RHS.getOpcode() == ISD::OR) && + (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS)) { + Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0); + Src2 = LHS; + return true; + } + return false; + }; + + if (isOrXorPattern()) { + SDValue Neg; + unsigned int CmpSz = CmpVT.getSizeInBits(); + // we need mask of all zeros or ones with same size of the other + // operands. 
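+      // Descriptive example of the fold this helper performs (derived from
+      // the code below, not new behavior):
+      //   select ((x & 1) == 0), y, (y | z) --> ((-(x & 1)) & z) | y
+      //   select ((x & 1) == 0), y, (y ^ z) --> ((-(x & 1)) & z) ^ y
+      // -(x & 1) is all-zeros when the low bit of x is clear and all-ones
+      // when it is set, so the AND keeps either 0 or z before the original
+      // OR/XOR is applied to LHS.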
+ if (CmpSz > VT.getSizeInBits()) + Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpVal); + else if (CmpSz < VT.getSizeInBits()) + Neg = DAG.getNode( + ISD::AND, DL, VT, + DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpVal.getOperand(0)), + DAG.getConstant(1, DL, VT)); + else + Neg = CmpVal; + SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1)) + SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z + return DAG.getNode(RHS.getOpcode(), DL, VT, And, Src2); // And Op y + } + } + + return SDValue(); +} + SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); @@ -24218,41 +24267,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Sub.getValue(1)); return DAG.getNode(ISD::OR, DL, VT, SBB, Y); - } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && - CmpOp0.getOpcode() == ISD::AND && - isOneConstant(CmpOp0.getOperand(1))) { - SDValue Src1, Src2; - // true if Op2 is XOR or OR operator and one of its operands - // is equal to Op1 - // ( a , a op b) || ( b , a op b) - auto isOrXorPattern = [&]() { - if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && - (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { - Src1 = - Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); - Src2 = Op1; - return true; - } - return false; - }; - - if (isOrXorPattern()) { - SDValue Neg; - unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); - // we need mask of all zeros or ones with same size of the other - // operands. - if (CmpSz > VT.getSizeInBits()) - Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); - else if (CmpSz < VT.getSizeInBits()) - Neg = DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), - DAG.getConstant(1, DL, VT)); - else - Neg = CmpOp0; - SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1)) - SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z - return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y - } + } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode, + DL, DAG, Subtarget)) { + return R; } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && ((CondCode == X86::COND_S) || // smin(x, 0) From 233ed51cf53d590d3f52d5becff95317dbf73657 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 5 Sep 2024 15:57:20 +0200 Subject: [PATCH 229/425] [LLD][COFF][NFC] Use is64Bit in Baserel::getDefaultType. (#107378) In preparation for ARM64EC support. Also make it static. --- lld/COFF/Chunks.cpp | 11 +---------- lld/COFF/Chunks.h | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 386012e3ce823..060eb6c32004d 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1000,16 +1000,7 @@ void BaserelChunk::writeTo(uint8_t *buf) const { } uint8_t Baserel::getDefaultType(llvm::COFF::MachineTypes machine) { - switch (machine) { - case AMD64: - case ARM64: - return IMAGE_REL_BASED_DIR64; - case I386: - case ARMNT: - return IMAGE_REL_BASED_HIGHLOW; - default: - llvm_unreachable("unknown machine type"); - } + return is64Bit(machine) ? 
IMAGE_REL_BASED_DIR64 : IMAGE_REL_BASED_HIGHLOW; } MergeChunk::MergeChunk(uint32_t alignment) diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 8ccd05b21af7b..30e5b538c352e 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -713,7 +713,7 @@ class Baserel { Baserel(uint32_t v, uint8_t ty) : rva(v), type(ty) {} explicit Baserel(uint32_t v, llvm::COFF::MachineTypes machine) : Baserel(v, getDefaultType(machine)) {} - uint8_t getDefaultType(llvm::COFF::MachineTypes machine); + static uint8_t getDefaultType(llvm::COFF::MachineTypes machine); uint32_t rva; uint8_t type; From 9707b98e572adf34ef3e71bcf159dae08e654fd8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 16:02:50 +0200 Subject: [PATCH 230/425] [ConstantRange] Perform increment on APInt (NFC) This handles the edge case where BitWidth is 1 and doing the increment gets a value that's not valid in that width, while we just want wrap-around. Split out of https://github.com/llvm/llvm-project/pull/80309. --- llvm/lib/Analysis/ValueTracking.cpp | 2 +- llvm/lib/IR/ConstantRange.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 8f35fd5eb5268..3a0ec99ee5ea1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9596,7 +9596,7 @@ static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II) { case Intrinsic::cttz: // Maximum of set/clear bits is the bit width. return ConstantRange::getNonEmpty(APInt::getZero(Width), - APInt(Width, Width + 1)); + APInt(Width, Width) + 1); case Intrinsic::uadd_sat: // uadd.sat(x, C) produces [C, UINT_MAX]. if (match(II.getOperand(0), m_APInt(C)) || diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index c389d7214defc..61a051821a5db 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -1952,7 +1952,7 @@ ConstantRange ConstantRange::ctlz(bool ZeroIsPoison) const { // Zero is either safe or not in the range. The output range is composed by // the result of countLeadingZero of the two extremes. return getNonEmpty(APInt(getBitWidth(), getUnsignedMax().countl_zero()), - APInt(getBitWidth(), getUnsignedMin().countl_zero() + 1)); + APInt(getBitWidth(), getUnsignedMin().countl_zero()) + 1); } static ConstantRange getUnsignedCountTrailingZerosRange(const APInt &Lower, @@ -2011,7 +2011,7 @@ ConstantRange ConstantRange::cttz(bool ZeroIsPoison) const { } if (isFullSet()) - return getNonEmpty(Zero, APInt(BitWidth, BitWidth + 1)); + return getNonEmpty(Zero, APInt(BitWidth, BitWidth) + 1); if (!isWrappedSet()) return getUnsignedCountTrailingZerosRange(Lower, Upper); // The range is wrapped. We decompose it into two ranges, [0, Upper) and @@ -2056,7 +2056,7 @@ ConstantRange ConstantRange::ctpop() const { unsigned BitWidth = getBitWidth(); APInt Zero = APInt::getZero(BitWidth); if (isFullSet()) - return getNonEmpty(Zero, APInt(BitWidth, BitWidth + 1)); + return getNonEmpty(Zero, APInt(BitWidth, BitWidth) + 1); if (!isWrappedSet()) return getUnsignedPopCountRange(Lower, Upper); // The range is wrapped. We decompose it into two ranges, [0, Upper) and From 9e85efb0dec8e78ca69925a05c0bbba211dee507 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 5 Sep 2024 16:04:52 +0200 Subject: [PATCH 231/425] [ConstantRangeTest] Set APInt signed flags where needed (NFC) Split out from https://github.com/llvm/llvm-project/pull/80309 to avoid assertion failures in the future. 
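A minimal sketch (editorial illustration, not part of this patch or the preceding
[ConstantRange] change) of the APInt constructor pattern both NFC changes prepare
for, assuming the stricter value-range assertions discussed in PR #80309; the
function name apintSketch is made up for the example:

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    void apintSketch() {
      // A negative value has to be marked as signed explicitly, otherwise a
      // constructor that asserts "value fits in BitWidth" would fire:
      APInt SignedVal(16, -5, /*isSigned=*/true);

      // For "max + 1" style bounds, do the increment on the APInt so that a
      // 1-bit value wraps to 0 instead of handing the unrepresentable value
      // 2 to the constructor, as APInt(Width, Width + 1) would for Width == 1:
      unsigned Width = 1;
      APInt UpperBound = APInt(Width, Width) + 1;
      (void)SignedVal;
      (void)UpperBound;
    }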
--- llvm/unittests/IR/ConstantRangeTest.cpp | 240 ++++++++++++------------ 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 4815117458b9a..e1d9b3e387b20 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -372,10 +372,10 @@ TEST_F(ConstantRangeTest, GetMinsAndMaxes) { EXPECT_EQ(Some.getSignedMax(), APInt(16, 0xaa9)); EXPECT_EQ(Wrap.getSignedMax(), APInt(16, INT16_MAX)); - EXPECT_EQ(Full.getSignedMin(), APInt(16, (uint64_t)INT16_MIN)); + EXPECT_EQ(Full.getSignedMin(), APInt(16, (uint16_t)INT16_MIN)); EXPECT_EQ(One.getSignedMin(), APInt(16, 0xa)); EXPECT_EQ(Some.getSignedMin(), APInt(16, 0xa)); - EXPECT_EQ(Wrap.getSignedMin(), APInt(16, (uint64_t)INT16_MIN)); + EXPECT_EQ(Wrap.getSignedMin(), APInt(16, (uint16_t)INT16_MIN)); // Found by Klee EXPECT_EQ(ConstantRange(APInt(4, 7), APInt(4, 0)).getSignedMax(), @@ -487,7 +487,7 @@ TEST_F(ConstantRangeTest, SExt) { APInt(20, INT16_MAX + 1, true))); EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, 140)).signExtend(16), - ConstantRange(APInt(16, -128), APInt(16, 128))); + ConstantRange(APInt(16, -128, true), APInt(16, 128))); EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19), ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000))); @@ -516,7 +516,7 @@ TEST_F(ConstantRangeTest, IntersectWith) { EXPECT_TRUE(LHS.intersectWith(RHS) == LHS); // previous bug: intersection of [min, 3) and [2, max) should be 2 - LHS = ConstantRange(APInt(32, -2147483646), APInt(32, 3)); + LHS = ConstantRange(APInt(32, (uint32_t)-2147483646), APInt(32, 3)); RHS = ConstantRange(APInt(32, 2), APInt(32, 2147483646)); EXPECT_EQ(LHS.intersectWith(RHS), ConstantRange(APInt(32, 2))); @@ -744,45 +744,51 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_NE(Some.addWithNoWrap(Full, OBO::NoSignedWrap), Full); EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, 1), APInt(16, 2)), OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN + 1), APInt(16, INT16_MIN))); + ConstantRange(APInt(16, INT16_MIN + 1, true), + APInt(16, INT16_MIN, true))); EXPECT_EQ(ConstantRange(APInt(16, 1), APInt(16, 2)) .addWithNoWrap(Full, OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN + 1), APInt(16, INT16_MIN))); - EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, -1), APInt(16, 0)), + ConstantRange(APInt(16, INT16_MIN + 1, true), + APInt(16, INT16_MIN, true))); + EXPECT_EQ(Full.addWithNoWrap(ConstantRange(APInt(16, -1, true), APInt(16, 0)), OBO::NoSignedWrap), - ConstantRange(APInt(16, INT16_MIN), APInt(16, INT16_MAX))); + ConstantRange(APInt(16, INT16_MIN, true), APInt(16, INT16_MAX))); EXPECT_EQ(ConstantRange(APInt(8, 100), APInt(8, 120)) .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, 123)), OBO::NoSignedWrap), ConstantRange(8, false)); - EXPECT_EQ(ConstantRange(APInt(8, -120), APInt(8, -100)) - .addWithNoWrap(ConstantRange(APInt(8, -110), APInt(8, -100)), - OBO::NoSignedWrap), + EXPECT_EQ(ConstantRange(APInt(8, -120, true), APInt(8, -100, true)) + .addWithNoWrap( + ConstantRange(APInt(8, -110, true), APInt(8, -100, true)), + OBO::NoSignedWrap), ConstantRange(8, false)); - EXPECT_EQ(ConstantRange(APInt(8, 0), APInt(8, 101)) - .addWithNoWrap(ConstantRange(APInt(8, -128), APInt(8, 28)), - OBO::NoSignedWrap), - ConstantRange(8, true)); - EXPECT_EQ(ConstantRange(APInt(8, 0), APInt(8, 101)) - .addWithNoWrap(ConstantRange(APInt(8, -120), APInt(8, 29)), - OBO::NoSignedWrap), - ConstantRange(APInt(8, -120), APInt(8, 
-128))); - EXPECT_EQ(ConstantRange(APInt(8, -50), APInt(8, 50)) + EXPECT_EQ( + ConstantRange(APInt(8, 0), APInt(8, 101)) + .addWithNoWrap(ConstantRange(APInt(8, -128, true), APInt(8, 28)), + OBO::NoSignedWrap), + ConstantRange(8, true)); + EXPECT_EQ( + ConstantRange(APInt(8, 0), APInt(8, 101)) + .addWithNoWrap(ConstantRange(APInt(8, -120, true), APInt(8, 29)), + OBO::NoSignedWrap), + ConstantRange(APInt(8, -120, true), APInt(8, -128, true))); + EXPECT_EQ(ConstantRange(APInt(8, -50, true), APInt(8, 50)) .addWithNoWrap(ConstantRange(APInt(8, 10), APInt(8, 20)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -40), APInt(8, 69))); + ConstantRange(APInt(8, -40, true), APInt(8, 69))); EXPECT_EQ(ConstantRange(APInt(8, 10), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, -50), APInt(8, 50)), + .addWithNoWrap(ConstantRange(APInt(8, -50, true), APInt(8, 50)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -40), APInt(8, 69))); - EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, -10)) + ConstantRange(APInt(8, -40, true), APInt(8, 69))); + EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, -10, true)) .addWithNoWrap(ConstantRange(APInt(8, 5), APInt(8, 20)), OBO::NoSignedWrap), ConstantRange(APInt(8, 125), APInt(8, 9))); - EXPECT_EQ(ConstantRange(APInt(8, 5), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, -10)), - OBO::NoSignedWrap), - ConstantRange(APInt(8, 125), APInt(8, 9))); + EXPECT_EQ( + ConstantRange(APInt(8, 5), APInt(8, 20)) + .addWithNoWrap(ConstantRange(APInt(8, 120), APInt(8, -10, true)), + OBO::NoSignedWrap), + ConstantRange(APInt(8, 125), APInt(8, 9))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -827,15 +833,15 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 10), APInt(8, 20)) .addWithNoWrap(ConstantRange(APInt(8, 50), APInt(8, 200)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 60), APInt(8, -37))); - EXPECT_EQ(ConstantRange(APInt(8, 20), APInt(8, -30)) + ConstantRange(APInt(8, 60), APInt(8, -37, true))); + EXPECT_EQ(ConstantRange(APInt(8, 20), APInt(8, -30, true)) .addWithNoWrap(ConstantRange(APInt(8, 5), APInt(8, 20)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 25), APInt(8, -11))); + ConstantRange(APInt(8, 25), APInt(8, -11, true))); EXPECT_EQ(ConstantRange(APInt(8, 5), APInt(8, 20)) - .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, -30)), + .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, -30, true)), OBO::NoUnsignedWrap), - ConstantRange(APInt(8, 25), APInt(8, -11))); + ConstantRange(APInt(8, 25), APInt(8, -11, true))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -853,7 +859,7 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoSignedWrap), - ConstantRange(APInt(8, 70), APInt(8, -128))); + ConstantRange(APInt(8, 70), APInt(8, -128, true))); EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoUnsignedWrap), @@ -861,17 +867,17 @@ TEST_F(ConstantRangeTest, AddWithNoWrap) { EXPECT_EQ(ConstantRange(APInt(8, 50), APInt(8, 100)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 70)), OBO::NoUnsignedWrap | OBO::NoSignedWrap), - ConstantRange(APInt(8, 70), APInt(8, -128))); + ConstantRange(APInt(8, 70), APInt(8, -128, true))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) 
.addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoSignedWrap), - ConstantRange(APInt(8, -80), APInt(8, -21))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + ConstantRange(APInt(8, -80, true), APInt(8, -21, true))); + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoUnsignedWrap), ConstantRange(APInt(8, 176), APInt(8, 235))); - EXPECT_EQ(ConstantRange(APInt(8, -100), APInt(8, -50)) + EXPECT_EQ(ConstantRange(APInt(8, -100, true), APInt(8, -50, true)) .addWithNoWrap(ConstantRange(APInt(8, 20), APInt(8, 30)), OBO::NoUnsignedWrap | OBO::NoSignedWrap), ConstantRange(APInt(8, 176), APInt(8, 235))); @@ -1004,17 +1010,17 @@ TEST_F(ConstantRangeTest, Multiply) { ConstantRange(APInt(8, 250), APInt(8, 253))); // TODO: This should be return [-2, 0] - EXPECT_EQ(ConstantRange(APInt(8, -2)).multiply( - ConstantRange(APInt(8, 0), APInt(8, 2))), - ConstantRange(APInt(8, -2), APInt(8, 1))); + EXPECT_EQ(ConstantRange(APInt(8, -2, true)) + .multiply(ConstantRange(APInt(8, 0), APInt(8, 2))), + ConstantRange(APInt(8, -2, true), APInt(8, 1))); // Multiplication by -1 should give precise results. - EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11)) - .multiply(ConstantRange(APInt(8, -1))), - ConstantRange(APInt(8, 12), APInt(8, -2))); - EXPECT_EQ(ConstantRange(APInt(8, -1)) - .multiply(ConstantRange(APInt(8, 3), APInt(8, -11))), - ConstantRange(APInt(8, 12), APInt(8, -2))); + EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11, true)) + .multiply(ConstantRange(APInt(8, -1, true))), + ConstantRange(APInt(8, 12), APInt(8, -2, true))); + EXPECT_EQ(ConstantRange(APInt(8, -1, true)) + .multiply(ConstantRange(APInt(8, 3), APInt(8, -11, true))), + ConstantRange(APInt(8, 12), APInt(8, -2, true))); TestBinaryOpExhaustive( [](const ConstantRange &CR1, const ConstantRange &CR2) { @@ -1185,11 +1191,11 @@ TEST_F(ConstantRangeTest, SMax) { EXPECT_EQ(Empty.smax(Wrap), Empty); EXPECT_EQ(Empty.smax(One), Empty); EXPECT_EQ(Some.smax(Some), Some); - EXPECT_EQ(Some.smax(Wrap), ConstantRange(APInt(16, 0xa), - APInt(16, (uint64_t)INT16_MIN))); + EXPECT_EQ(Some.smax(Wrap), + ConstantRange(APInt(16, 0xa), APInt(16, (uint16_t)INT16_MIN))); EXPECT_EQ(Some.smax(One), Some); - EXPECT_EQ(Wrap.smax(One), ConstantRange(APInt(16, 0xa), - APInt(16, (uint64_t)INT16_MIN))); + EXPECT_EQ(Wrap.smax(One), + ConstantRange(APInt(16, 0xa), APInt(16, (uint16_t)INT16_MIN))); EXPECT_EQ(One.smax(One), One); TestBinaryOpExhaustive( @@ -1231,20 +1237,20 @@ TEST_F(ConstantRangeTest, UMin) { TEST_F(ConstantRangeTest, SMin) { EXPECT_EQ(Full.smin(Full), Full); EXPECT_EQ(Full.smin(Empty), Empty); - EXPECT_EQ(Full.smin(Some), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xaaa))); + EXPECT_EQ(Full.smin(Some), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xaaa))); EXPECT_EQ(Full.smin(Wrap), Full); EXPECT_EQ(Empty.smin(Empty), Empty); EXPECT_EQ(Empty.smin(Some), Empty); EXPECT_EQ(Empty.smin(Wrap), Empty); EXPECT_EQ(Empty.smin(One), Empty); EXPECT_EQ(Some.smin(Some), Some); - EXPECT_EQ(Some.smin(Wrap), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xaaa))); + EXPECT_EQ(Some.smin(Wrap), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xaaa))); EXPECT_EQ(Some.smin(One), One); EXPECT_EQ(Wrap.smin(Wrap), Wrap); - EXPECT_EQ(Wrap.smin(One), ConstantRange(APInt(16, (uint64_t)INT16_MIN), - APInt(16, 0xb))); + EXPECT_EQ(Wrap.smin(One), + ConstantRange(APInt(16, (uint16_t)INT16_MIN), APInt(16, 0xb))); EXPECT_EQ(One.smin(One), 
One); TestBinaryOpExhaustive( @@ -1320,8 +1326,8 @@ TEST_F(ConstantRangeTest, SDiv) { } // If there is a non-full signed envelope, that should be the result. - APInt SMin(Bits, Results.find_first() - Bias); - APInt SMax(Bits, Results.find_last() - Bias); + APInt SMin(Bits, Results.find_first() - Bias, true); + APInt SMax(Bits, Results.find_last() - Bias, true); ConstantRange Envelope = ConstantRange::getNonEmpty(SMin, SMax + 1); if (!Envelope.isFullSet()) { EXPECT_EQ(Envelope, CR); @@ -1340,8 +1346,8 @@ TEST_F(ConstantRangeTest, SDiv) { --LastPos; } - APInt WMax(Bits, LastNeg); - APInt WMin(Bits, LastPos); + APInt WMax(Bits, LastNeg, true); + APInt WMin(Bits, LastPos, true); ConstantRange Wrapped = ConstantRange::getNonEmpty(WMin, WMax + 1); EXPECT_EQ(Wrapped, CR); }); @@ -1394,8 +1400,8 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(Full.srem(Full), ConstantRange(APInt::getSignedMinValue(16) + 1, APInt::getSignedMinValue(16))); - ConstantRange PosMod(APInt(16, 10), APInt(16, 21)); // [10, 20] - ConstantRange NegMod(APInt(16, -20), APInt(16, -9)); // [-20, -10] + ConstantRange PosMod(APInt(16, 10), APInt(16, 21)); // [10, 20] + ConstantRange NegMod(APInt(16, -20, true), APInt(16, -9, true)); // [-20, -10] ConstantRange IntMinMod(APInt::getSignedMinValue(16)); ConstantRange Expected(16, true); @@ -1405,12 +1411,12 @@ TEST_F(ConstantRangeTest, SRem) { Expected = ConstantRange(APInt(16, 0), APInt(16, 20)); EXPECT_EQ(PosLargeLHS.srem(PosMod), Expected); EXPECT_EQ(PosLargeLHS.srem(NegMod), Expected); - ConstantRange NegLargeLHS(APInt(16, -40), APInt(16, 1)); - Expected = ConstantRange(APInt(16, -19), APInt(16, 1)); + ConstantRange NegLargeLHS(APInt(16, -40, true), APInt(16, 1)); + Expected = ConstantRange(APInt(16, -19, true), APInt(16, 1)); EXPECT_EQ(NegLargeLHS.srem(PosMod), Expected); EXPECT_EQ(NegLargeLHS.srem(NegMod), Expected); - ConstantRange PosNegLargeLHS(APInt(16, -32), APInt(16, 38)); - Expected = ConstantRange(APInt(16, -19), APInt(16, 20)); + ConstantRange PosNegLargeLHS(APInt(16, -32, true), APInt(16, 38)); + Expected = ConstantRange(APInt(16, -19, true), APInt(16, 20)); EXPECT_EQ(PosNegLargeLHS.srem(PosMod), Expected); EXPECT_EQ(PosNegLargeLHS.srem(NegMod), Expected); @@ -1419,11 +1425,11 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(PosLHS.srem(PosMod), PosLHS); EXPECT_EQ(PosLHS.srem(NegMod), PosLHS); EXPECT_EQ(PosLHS.srem(IntMinMod), PosLHS); - ConstantRange NegLHS(APInt(16, -15), APInt(16, 1)); + ConstantRange NegLHS(APInt(16, -15, true), APInt(16, 1)); EXPECT_EQ(NegLHS.srem(PosMod), NegLHS); EXPECT_EQ(NegLHS.srem(NegMod), NegLHS); EXPECT_EQ(NegLHS.srem(IntMinMod), NegLHS); - ConstantRange PosNegLHS(APInt(16, -12), APInt(16, 18)); + ConstantRange PosNegLHS(APInt(16, -12, true), APInt(16, 18)); EXPECT_EQ(PosNegLHS.srem(PosMod), PosNegLHS); EXPECT_EQ(PosNegLHS.srem(NegMod), PosNegLHS); EXPECT_EQ(PosNegLHS.srem(IntMinMod), PosNegLHS); @@ -1433,11 +1439,11 @@ TEST_F(ConstantRangeTest, SRem) { EXPECT_EQ(PosSmallLHS.srem(PosMod), PosSmallLHS); EXPECT_EQ(PosSmallLHS.srem(NegMod), PosSmallLHS); EXPECT_EQ(PosSmallLHS.srem(IntMinMod), PosSmallLHS); - ConstantRange NegSmallLHS(APInt(16, -7), APInt(16, -2)); + ConstantRange NegSmallLHS(APInt(16, -7, true), APInt(16, -2, true)); EXPECT_EQ(NegSmallLHS.srem(PosMod), NegSmallLHS); EXPECT_EQ(NegSmallLHS.srem(NegMod), NegSmallLHS); EXPECT_EQ(NegSmallLHS.srem(IntMinMod), NegSmallLHS); - ConstantRange PosNegSmallLHS(APInt(16, -3), APInt(16, 8)); + ConstantRange PosNegSmallLHS(APInt(16, -3, true), APInt(16, 8)); 
EXPECT_EQ(PosNegSmallLHS.srem(PosMod), PosNegSmallLHS); EXPECT_EQ(PosNegSmallLHS.srem(NegMod), PosNegSmallLHS); EXPECT_EQ(PosNegSmallLHS.srem(IntMinMod), PosNegSmallLHS); @@ -1554,27 +1560,27 @@ TEST_F(ConstantRangeTest, ShlWithNoWrap) { EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoSignedWrap), ConstantRange(APInt(16, 10), APInt(16, 20481))); EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoUnsignedWrap), - ConstantRange(APInt(16, 10), APInt(16, -24575))); + ConstantRange(APInt(16, 10), APInt(16, -24575, true))); EXPECT_EQ(One.shlWithNoWrap(Full, OBO::NoSignedWrap | OBO::NoUnsignedWrap), ConstantRange(APInt(16, 10), APInt(16, 20481))); ConstantRange NegOne(APInt(16, 0xffff)); EXPECT_EQ(NegOne.shlWithNoWrap(Full, OBO::NoSignedWrap), - ConstantRange(APInt(16, -32768), APInt(16, 0))); + ConstantRange(APInt(16, -32768, true), APInt(16, 0))); EXPECT_EQ(NegOne.shlWithNoWrap(Full, OBO::NoUnsignedWrap), NegOne); EXPECT_EQ(ConstantRange(APInt(16, 768)) .shlWithNoWrap(Full, OBO::NoSignedWrap | OBO::NoUnsignedWrap), ConstantRange(APInt(16, 768), APInt(16, 24577))); EXPECT_EQ(Full.shlWithNoWrap(ConstantRange(APInt(16, 1), APInt(16, 16)), OBO::NoUnsignedWrap), - ConstantRange(APInt(16, 0), APInt(16, -1))); - EXPECT_EQ(ConstantRange(APInt(4, 3), APInt(4, -8)) + ConstantRange(APInt(16, 0), APInt(16, -1, true))); + EXPECT_EQ(ConstantRange(APInt(4, 3), APInt(4, -8, true)) .shlWithNoWrap(ConstantRange(APInt(4, 0), APInt(4, 4)), OBO::NoSignedWrap), - ConstantRange(APInt(4, 3), APInt(4, -8))); - EXPECT_EQ(ConstantRange(APInt(4, -1), APInt(4, 0)) + ConstantRange(APInt(4, 3), APInt(4, -8, true))); + EXPECT_EQ(ConstantRange(APInt(4, -1, true), APInt(4, 0)) .shlWithNoWrap(ConstantRange(APInt(4, 1), APInt(4, 4)), OBO::NoSignedWrap), - ConstantRange(APInt(4, -8), APInt(4, -1))); + ConstantRange(APInt(4, -8, true), APInt(4, -1, true))); } TEST_F(ConstantRangeTest, Lshr) { @@ -1620,9 +1626,9 @@ TEST_F(ConstantRangeTest, Ashr) { APInt(16, (0xaaa >> 0xa) + 1))); EXPECT_EQ(Some.ashr(Wrap), ConstantRange(APInt(16, 0), APInt(16, 0xaaa))); EXPECT_EQ(Wrap.ashr(Wrap), Full); - ConstantRange Neg(APInt(16, 0xf3f0, true), APInt(16, 0xf7f8, true)); - EXPECT_EQ(Neg.ashr(Small), ConstantRange(APInt(16, 0xfffc, true), - APInt(16, 0xfffe, true))); + ConstantRange Neg(APInt(16, 0xf3f0), APInt(16, 0xf7f8)); + EXPECT_EQ(Neg.ashr(Small), + ConstantRange(APInt(16, 0xfffc), APInt(16, 0xfffe))); } TEST(ConstantRange, MakeAllowedICmpRegion) { @@ -1665,23 +1671,23 @@ TEST(ConstantRange, MakeSatisfyingICmpRegion) { UnsignedSample), ConstantRange(APInt(8, 199), APInt(8, 0))); - ConstantRange SignedSample(APInt(8, -5), APInt(8, 5)); + ConstantRange SignedSample(APInt(8, -5, true), APInt(8, 5)); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SLT, SignedSample), - ConstantRange(APInt(8, -128), APInt(8, -5))); + ConstantRange(APInt(8, -128, true), APInt(8, -5, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SLE, SignedSample), - ConstantRange(APInt(8, -128), APInt(8, -4))); + ConstantRange(APInt(8, -128, true), APInt(8, -4, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SGT, SignedSample), - ConstantRange(APInt(8, 5), APInt(8, -128))); + ConstantRange(APInt(8, 5), APInt(8, -128, true))); EXPECT_EQ( ConstantRange::makeSatisfyingICmpRegion(ICmpInst::ICMP_SGE, SignedSample), - ConstantRange(APInt(8, 4), APInt(8, -128))); + ConstantRange(APInt(8, 4), APInt(8, -128, true))); } void ICmpTestImpl(CmpInst::Predicate Pred) { @@ -1703,7 +1709,7 @@ TEST(ConstantRange, ICmp) { } TEST(ConstantRange, 
MakeGuaranteedNoWrapRegion) { - const int IntMin4Bits = 8; + const int IntMin4Bits = -8; const int IntMax4Bits = 7; typedef OverflowingBinaryOperator OBO; @@ -1812,7 +1818,7 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { Instruction::Sub, OneToFive, OBO::NoUnsignedWrap), ConstantRange(APInt::getMinValue(32) + 5, APInt::getMinValue(32))); - ConstantRange MinusFiveToMinusTwo(APInt(32, -5), APInt(32, -1)); + ConstantRange MinusFiveToMinusTwo(APInt(32, -5, true), APInt(32, -1, true)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, MinusFiveToMinusTwo, OBO::NoSignedWrap), ConstantRange(APInt::getSignedMinValue(32) + 5, @@ -1826,10 +1832,9 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { APInt::getSignedMaxValue(32) - 4)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Sub, MinusFiveToMinusTwo, OBO::NoUnsignedWrap), - ConstantRange(APInt::getMaxValue(32) - 1, - APInt::getMinValue(32))); + ConstantRange(APInt::getMaxValue(32) - 1, APInt::getMinValue(32))); - ConstantRange MinusOneToOne(APInt(32, -1), APInt(32, 2)); + ConstantRange MinusOneToOne(APInt(32, -1, true), APInt(32, 2)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, MinusOneToOne, OBO::NoSignedWrap), ConstantRange(APInt::getSignedMinValue(32) + 1, @@ -1877,7 +1882,7 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { ConstantRange(APInt(32, 0), APInt(32, 1) + 1)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, UpToBitWidth, OBO::NoSignedWrap), - ConstantRange(APInt(32, -1), APInt(32, 0) + 1)); + ConstantRange(APInt(32, -1, true), APInt(32, 0) + 1)); EXPECT_EQ( ConstantRange::makeGuaranteedNoWrapRegion( @@ -1898,34 +1903,36 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) { Instruction::Shl, IllegalShAmt, OBO::NoSignedWrap), ConstantRange::getFull(32)); - EXPECT_EQ( - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, -32), APInt(32, 16) + 1), - OBO::NoUnsignedWrap), - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, 0), APInt(32, 16) + 1), - OBO::NoUnsignedWrap)); - EXPECT_EQ( - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, -32), APInt(32, 16) + 1), - OBO::NoSignedWrap), - ConstantRange::makeGuaranteedNoWrapRegion( - Instruction::Shl, ConstantRange(APInt(32, 0), APInt(32, 16) + 1), - OBO::NoSignedWrap)); + EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), + OBO::NoUnsignedWrap), + ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, 0), APInt(32, 16) + 1), + OBO::NoUnsignedWrap)); + EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), + OBO::NoSignedWrap), + ConstantRange::makeGuaranteedNoWrapRegion( + Instruction::Shl, + ConstantRange(APInt(32, 0), APInt(32, 16) + 1), + OBO::NoSignedWrap)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, - ConstantRange(APInt(32, -32), APInt(32, 16) + 1), + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), OBO::NoUnsignedWrap), ConstantRange(APInt(32, 0), APInt(32, 65535) + 1)); EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Shl, - ConstantRange(APInt(32, -32), APInt(32, 16) + 1), + ConstantRange(APInt(32, -32, true), APInt(32, 16) + 1), OBO::NoSignedWrap), - ConstantRange(APInt(32, -32768), APInt(32, 32767) + 1)); + ConstantRange(APInt(32, -32768, 
true), APInt(32, 32767) + 1)); } -template +template void TestNoWrapRegionExhaustive(Instruction::BinaryOps BinOp, unsigned NoWrapKind, Fn OverflowFn) { for (unsigned Bits : {1, 5}) { @@ -2090,14 +2097,15 @@ TEST(ConstantRange, GetEquivalentICmp) { EXPECT_EQ(Pred, CmpInst::ICMP_NE); EXPECT_EQ(RHS, APInt(32, 0)); - EXPECT_TRUE(ConstantRange(APInt(32, -1)).getEquivalentICmp(Pred, RHS)); + EXPECT_TRUE(ConstantRange(APInt(32, -1, true)).getEquivalentICmp(Pred, RHS)); EXPECT_EQ(Pred, CmpInst::ICMP_EQ); - EXPECT_EQ(RHS, APInt(32, -1)); + EXPECT_EQ(RHS, APInt(32, -1, true)); - EXPECT_TRUE( - ConstantRange(APInt(32, -1)).inverse().getEquivalentICmp(Pred, RHS)); + EXPECT_TRUE(ConstantRange(APInt(32, -1, true)) + .inverse() + .getEquivalentICmp(Pred, RHS)); EXPECT_EQ(Pred, CmpInst::ICMP_NE); - EXPECT_EQ(RHS, APInt(32, -1)); + EXPECT_EQ(RHS, APInt(32, -1, true)); EnumerateInterestingConstantRanges([](const ConstantRange &CR) { unsigned Bits = CR.getBitWidth(); From 5024dff6eee5a95a741b063c953422c5b6d02fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 5 Sep 2024 17:25:41 +0300 Subject: [PATCH 232/425] [libc++][ci] Add a test configuration with an incomplete sysroot (#107089) When bringing up a new cross compiler from scratch, we build libunwind/libcxx in a setup where the toolchain is incomplete and unable to perform the normal linker checks; this requires a few special cases in the CMake files. We simulate that scenario by removing the libc++ headers, libunwind and libc++ libraries from the installed toolchain. We need to set CMAKE_CXX_COMPILER_WORKS since CMake fails to probe the compiler. We need to set CMAKE_CXX_COMPILER_TARGET, since LLVM's heuristics fail when CMake hasn't been able to probe the environment properly. (This is normal; one has to set those options when setting up such a toolchain from scratch.) This adds CI coverage for these build scenarios, which otherwise seldom are tested by some build flow (but are essential when setting up a cross compiler from scratch). 
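For reference, the bring-up configuration described above reduces to a handful of CMake cache variables. The sketch below is illustrative only (the source/build paths, generator, and runtimes list are assumptions, not part of this patch); the authoritative form is the mingw-incomplete-sysroot case added to libcxx/utils/ci/run-buildbot below.

    # Hypothetical manual invocation for an incomplete sysroot; adjust the
    # triple, runtimes list, and cache file for the toolchain being brought up.
    cmake -G Ninja -S runtimes -B build \
      -DLLVM_ENABLE_RUNTIMES="libunwind;libcxxabi;libcxx" \
      -DCMAKE_C_COMPILER_WORKS=TRUE \
      -DCMAKE_CXX_COMPILER_WORKS=TRUE \
      -DCMAKE_C_COMPILER_TARGET=x86_64-w64-windows-gnu \
      -DCMAKE_CXX_COMPILER_TARGET=x86_64-w64-windows-gnu \
      -C libcxx/cmake/caches/MinGW.cmake
    # Building is the interesting part; running the tests would add little
    # over the existing mingw-dll configuration.
    ninja -C build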
--- .github/workflows/libcxx-build-and-test.yaml | 7 ++++++ libcxx/utils/ci/run-buildbot | 23 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 1a26a699db8e0..b5e60781e0006 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -242,6 +242,7 @@ jobs: - { config: mingw-dll, mingw: true } - { config: mingw-static, mingw: true } - { config: mingw-dll-i686, mingw: true } + - { config: mingw-incomplete-sysroot, mingw: true } steps: - uses: actions/checkout@v4 - name: Install dependencies @@ -260,6 +261,12 @@ jobs: del llvm-mingw*.zip mv llvm-mingw* c:\llvm-mingw echo "c:\llvm-mingw\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append + - name: Simulate a from-scratch build of llvm-mingw + if: ${{ matrix.config == 'mingw-incomplete-sysroot' }} + run: | + rm -r c:\llvm-mingw\include\c++ + rm -r c:\llvm-mingw\*-w64-mingw32\lib\libc++* + rm -r c:\llvm-mingw\*-w64-mingw32\lib\libunwind* - name: Add Git Bash to the path run: | echo "c:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 14ff611302981..b0533cb9a49c9 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -715,6 +715,29 @@ mingw-dll-i686) -C "${MONOREPO_ROOT}/libcxx/cmake/caches/MinGW.cmake" check-runtimes ;; +mingw-incomplete-sysroot) + # When bringing up a new cross compiler from scratch, we build + # libunwind/libcxx in a setup where the toolchain is incomplete and + # unable to perform the normal linker checks; this requires a few + # special cases in the CMake files. + # + # Building in an incomplete setup requires setting CMAKE_*_COMPILER_WORKS, + # as CMake fails to probe the compiler. This case also requires + # setting CMAKE_CXX_COMPILER_TARGET, as LLVM's heuristics for setting + # the triple fails when CMake hasn't been able to probe the environment. + # (This is what one has to do when building the initial libunwind/libcxx + # for a new toolchain.) + clean + generate-cmake \ + -DCMAKE_C_COMPILER_WORKS=TRUE \ + -DCMAKE_CXX_COMPILER_WORKS=TRUE \ + -DCMAKE_C_COMPILER_TARGET=x86_64-w64-windows-gnu \ + -DCMAKE_CXX_COMPILER_TARGET=x86_64-w64-windows-gnu \ + -C "${MONOREPO_ROOT}/libcxx/cmake/caches/MinGW.cmake" + # Only test that building succeeds; there's not much extra value in running + # the tests here, as it would be equivalent to the mingw-dll config above. + ${NINJA} -vC "${BUILD_DIR}" +;; aix) clean generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/AIX.cmake" \ From 16dc65bdc0f0a23bc2696afce2abecd9f2faa097 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 5 Sep 2024 07:33:55 -0700 Subject: [PATCH 233/425] [mlgo] Fix test post - #106744 Trivial fix, some instruction opcodes changed. --- llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll index 79be2447abcff..31e120007cb7d 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll @@ -26,7 +26,7 @@ ; Also, the first eviction problem is significantly less than 300 instructions. Check ; that there is a zero value. 
; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 19,{{([0-9]{4})}},1{{([0-9]{3})}},1{{([0-9]{3})}},{{.*}},0, +; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},1{{([0-9]{3})}},{{.*}},0, ; Only the candidate virtreg and the 10th LR are included in this problem. Make ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. ; There's a limit to how many repetitions can be matched. From bded3b3ea9f78c5b3edc3d4a6076665af0ea746b Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Thu, 5 Sep 2024 07:42:23 -0700 Subject: [PATCH 234/425] [llvm][AArch64] Improve the cost model for i128 div's (#107306) --- .../AArch64/AArch64TargetTransformInfo.cpp | 8 +++- llvm/test/Analysis/CostModel/AArch64/div.ll | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 58c267f1ce4bd..2b5deaff39719 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3224,8 +3224,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } [[fallthrough]]; case ISD::UDIV: { + auto VT = TLI->getValueType(DL, Ty); if (Op2Info.isConstant() && Op2Info.isUniform()) { - auto VT = TLI->getValueType(DL, Ty); if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { // Vector signed division by constant are expanded to the // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division @@ -3240,6 +3240,12 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } } + // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are + // emitted by the backend even when those functions are not declared in the + // module. 
+ if (!VT.isVector() && VT.getSizeInBits() > 64) + return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); + InstructionCost Cost = BaseT::getArithmeticInstrCost( Opcode, Ty, CostKind, Op1Info, Op2Info); if (Ty->isVectorTy()) { diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll index 7b52e2dc18aa5..2ceaf0c6f536a 100644 --- a/llvm/test/Analysis/CostModel/AArch64/div.ll +++ b/llvm/test/Analysis/CostModel/AArch64/div.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i32 @sdiv() { ; CHECK-LABEL: 'sdiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef @@ -23,6 +24,8 @@ define i32 @sdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, undef + %I64 = sdiv i64 undef, undef %V2i64 = sdiv <2 x i64> undef, undef %V4i64 = sdiv <4 x i64> undef, undef @@ -48,6 +51,7 @@ define i32 @sdiv() { define i32 @udiv() { ; CHECK-LABEL: 'udiv' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef @@ -66,6 +70,8 @@ define i32 @udiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, undef + %I64 = udiv i64 undef, undef %V2i64 = udiv <2 x i64> undef, undef %V4i64 = udiv <4 x i64> undef, undef @@ -91,6 +97,7 @@ define i32 @udiv() { define i32 @sdiv_const() { ; CHECK-LABEL: 'sdiv_const' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -109,6 +116,8 @@ define i32 @sdiv_const() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 7 + %I64 = sdiv i64 undef, 7 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -134,6 +143,7 @@ define i32 @sdiv_const() { define i32 @udiv_const() { ; CHECK-LABEL: 'udiv_const' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 
x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -152,6 +162,9 @@ define i32 @udiv_const() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + + %I128 = udiv i128 undef, 7 + %I64 = udiv i64 undef, 7 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -177,6 +190,7 @@ define i32 @udiv_const() { define i32 @sdiv_uniformconst() { ; CHECK-LABEL: 'sdiv_uniformconst' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -195,6 +209,8 @@ define i32 @sdiv_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 7 + %I64 = sdiv i64 undef, 7 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -220,6 +236,7 @@ define i32 @sdiv_uniformconst() { define i32 @udiv_uniformconst() { ; CHECK-LABEL: 'udiv_uniformconst' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -238,6 +255,8 @@ define i32 @udiv_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 7 + %I64 = udiv i64 undef, 7 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -263,6 +282,7 @@ define i32 @udiv_uniformconst() { define i32 @sdiv_constpow2() { ; CHECK-LABEL: 'sdiv_constpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -281,6 +301,8 @@ define i32 @sdiv_constpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 16 + %I64 = sdiv i64 undef, 16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -306,6 +328,7 @@ define i32 @sdiv_constpow2() { define i32 @udiv_constpow2() { ; CHECK-LABEL: 'udiv_constpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x 
i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -324,6 +347,8 @@ define i32 @udiv_constpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 16 + %I64 = udiv i64 undef, 16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -349,6 +374,7 @@ define i32 @udiv_constpow2() { define i32 @sdiv_uniformconstpow2() { ; CHECK-LABEL: 'sdiv_uniformconstpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -367,6 +393,8 @@ define i32 @sdiv_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, 16 + %I64 = sdiv i64 undef, 16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -392,6 +420,7 @@ define i32 @sdiv_uniformconstpow2() { define i32 @udiv_uniformconstpow2() { ; CHECK-LABEL: 'udiv_uniformconstpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -410,6 +439,8 @@ define i32 @udiv_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, 16 + %I64 = udiv i64 undef, 16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -435,6 +466,7 @@ define i32 @udiv_uniformconstpow2() { define i32 @sdiv_constnegpow2() { ; CHECK-LABEL: 'sdiv_constnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -453,6 +485,8 @@ define i32 @sdiv_constnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, -16 + %I64 = sdiv i64 undef, -16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -478,6 +512,7 @@ define i32 @sdiv_constnegpow2() { define i32 @udiv_constnegpow2() { ; CHECK-LABEL: 'udiv_constnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16 ; CHECK-NEXT: Cost 
Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -496,6 +531,8 @@ define i32 @udiv_constnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, -16 + %I64 = udiv i64 undef, -16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, @@ -521,6 +558,7 @@ define i32 @udiv_constnegpow2() { define i32 @sdiv_uniformconstnegpow2() { ; CHECK-LABEL: 'sdiv_uniformconstnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = sdiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, @@ -539,6 +577,8 @@ define i32 @sdiv_uniformconstnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = sdiv i128 undef, -16 + %I64 = sdiv i64 undef, -16 %V2i64 = sdiv <2 x i64> undef, %V4i64 = sdiv <4 x i64> undef, @@ -564,6 +604,7 @@ define i32 @sdiv_uniformconstnegpow2() { define i32 @udiv_uniformconstnegpow2() { ; CHECK-LABEL: 'udiv_uniformconstnegpow2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2i64 = udiv <2 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, @@ -582,6 +623,8 @@ define i32 @udiv_uniformconstnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %I128 = udiv i128 undef, -16 + %I64 = udiv i64 undef, -16 %V2i64 = udiv <2 x i64> undef, %V4i64 = udiv <4 x i64> undef, From 63e8a1b16f344eaef17c4015497326479e69d1e7 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 5 Sep 2024 07:52:27 -0700 Subject: [PATCH 235/425] [SLP] Enable reordering for non-power-of-two vectors (#106638) This change tries to enable vector reordering during vectorization for non-power-of-two vectors. Specifically, my goal is to be able to vectorize reductions whose operands appear in other than identity order. (i.e. a[1] + a[0] + a[2]). Our standard pass pipeline, Reassociation effectively canonicalizes towards this form. So for reduction vectorization to be wildly applicable, we need this feature. This change enables the use of a non-empty ReorderIndices structure - which is effectively required for out of order loads or gathers - while leaving the ReuseShuffleIndices mechanism unused and disabled. If I've understood the code structure, the former is used when describing implicit shuffles required by the vectorization strategy (i.e. loading elements 0,1,3,2 in the order 0,1,2,3 and then shuffling later), while the later is used when trying to optimize explode/buildvectors (called gathers in this code). 
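For concreteness, a small illustrative-only C example of the input shape this targets; it mirrors the dot_product_i32_reorder test updated below, where the scalar adds are consumed in non-identity order after Reassociation:

    // Illustrative only: three products summed in non-identity order, the
    // form Reassociation tends to produce and which previously blocked SLP.
    int dot3(const int *a, const int *b) {
      int m0 = a[0] * b[0];
      int m1 = a[1] * b[1];
      int m2 = a[2] * b[2];
      return (m1 + m0) + m2; // operands appear as 1, 0, 2 rather than 0, 1, 2
    }
    // With this change, SLP can emit a <3 x i32> load of each operand, a
    // vector multiply, and an @llvm.vector.reduce.add, instead of giving up
    // because the scalars are not in identity order.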
I audited all the code enabled by this change, but can't claim to deeply understand most of it. I added a couple of bailouts in places which appeared to be difficult to audit and optional optimizations. I've tried to do so in the least risky way I can, but am not completely confident in this change. Careful review appreciated. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 48 +++++----- .../AArch64/vec3-reorder-reshuffle.ll | 15 ++-- .../SLPVectorizer/RISCV/vec3-base.ll | 88 +++++++++++-------- .../X86/vec3-reorder-reshuffle.ll | 9 +- 4 files changed, 89 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 60476398e5ca7..74bb529b2526e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3388,6 +3388,10 @@ class BoUpSLP { TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->State = EntryState; + // FIXME: Remove once support for ReuseShuffleIndices has been implemented + // for non-power-of-two vectors. + assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) && + "Reshuffling scalars not yet supported for nodes with padding"); Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { @@ -3452,11 +3456,8 @@ class BoUpSLP { MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) { + if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && - "Reordering isn't implemented for non-power-of-2 nodes yet"); - } return Last; } @@ -4731,12 +4732,6 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( auto *VecTy = getWidenedType(ScalarTy, Sz); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (!Order.empty() && !has_single_bit(VL.size())) { - assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " - "supported with VectorizeNonPowerOf2"); - return LoadsState::Gather; - } Align CommonAlignment = computeCommonAlignment(VL); if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) && @@ -4824,6 +4819,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // representation is better than just gather. auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, bool ProfitableGatherPointers) { + // FIXME: The following code has not been updated for non-power-of-2 + // vectors. The splitting logic here does not cover the original + // vector if the vector factor is not a power of two. FIXME + if (!has_single_bit(VL.size())) + return false; + // Compare masked gather cost and loads + insert subvector costs. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto [ScalarGEPCost, VectorGEPCost] = @@ -5195,13 +5196,13 @@ static bool areTwoInsertFromSameBuildVector( std::optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { - // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (TE.isNonPowOf2Vec()) - return std::nullopt; - // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) { + // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 
+ assert(!TE.isNonPowOf2Vec() && + "Reshuffling scalars not yet supported for nodes with padding"); + if (isSplat(TE.Scalars)) return std::nullopt; // Check if reuse shuffle indices can be improved by reordering. @@ -5424,11 +5425,15 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } if (isSplat(TE.Scalars)) return std::nullopt; - if (TE.Scalars.size() >= 4) + if (TE.Scalars.size() >= 3) if (std::optional Order = findPartiallyOrderedLoads(TE)) return Order; - if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) - return CurrentOrder; + + // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars + // has been auditted for correctness with non-power-of-two vectors. + if (!TE.isNonPowOf2Vec()) + if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; } return std::nullopt; } @@ -5580,7 +5585,7 @@ void BoUpSLP::reorderTopToBottom() { // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; - VF /= 2) { + VF = bit_ceil(VF) / 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; @@ -5752,10 +5757,6 @@ bool BoUpSLP::canReorderOperands( TreeEntry *UserTE, SmallVectorImpl> &Edges, ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (UserTE->isNonPowOf2Vec()) - return false; - for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && @@ -5927,9 +5928,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto AllowsReordering = [&](const TreeEntry *TE) { - // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 
- if (TE->isNonPowOf2Vec()) - return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || (IgnoreReorder && TE->Idx == 0)) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 9bbd314a27cb9..c9b2e0ffc15f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -191,12 +191,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 @@ -263,7 +263,8 @@ define void @reorder_indices_2(ptr %spoint) { ; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 ; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[DSCO]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( @@ -566,11 +567,11 @@ define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) { ; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding( ; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[TMP0]], [[TMP0]] +; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[IN]], [[IN]] ; NON-POW2-NEXT: [[TMP2:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> , <3 x float> ) ; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <3 x float> [[TMP2]], -; NON-POW2-NEXT: store <3 x float> [[TMP3]], ptr [[A]], align 4 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP4]], ptr [[A]], align 4 ; NON-POW2-NEXT: ret void ; ; 
POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index d9e5655f4b4e0..4e8e019e155db 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -557,25 +557,34 @@ define i32 @dot_product_i32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_i32_reorder( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 -; CHECK-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 -; CHECK-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret i32 [[ADD_1]] +; NON-POW2-LABEL: @dot_product_i32_reorder( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]]) +; NON-POW2-NEXT: ret i32 [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_i32_reorder( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 +; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 +; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] +; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 
[[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret i32 [[ADD_1]] ; %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0 %l.a.0 = load i32, ptr %gep.a.0, align 4 @@ -653,22 +662,31 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_fp32_reorder( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret float [[ADD_1]] +; NON-POW2-LABEL: @dot_product_fp32_reorder( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) +; NON-POW2-NEXT: ret float [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_fp32_reorder( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 1399b4c35c781..22a59d3da52a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -190,12 +190,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 @@ -262,7 +262,8 @@ define void @reorder_indices_2(ptr %spoint) { ; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 ; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[DSCO]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( From 3b19e480c056a35a60e3c65de476b6097329ceac Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 5 Sep 2024 15:59:36 +0100 Subject: [PATCH 236/425] [flang] Warn when F128 is unsupported (#102147) (#106957) This generates `warning: REAL(KIND=16) is not an enabled type for this target` if that type is used in a build not correctly configured to support this type. Uses of `selected_real_kind(30)` return -1. Relanding #102147 because the test errors turned out to be specific to a downstream configuration. 
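The support check itself is small; the sketch below condenses the TargetSetup.h hunk in this patch into one place (the helper name is purely illustrative and does not exist in the patch):

    // Illustrative-only restatement of the check added to
    // flang/include/flang/Tools/TargetSetup.h; the function name is made up.
    #include <cfloat>

    constexpr bool targetSupportsReal16() {
    #ifdef FLANG_RUNTIME_F128_MATH_LIB
      return true;   // libquadmath wrappers provide the F128 math entries
    #elif LDBL_MANT_DIG == 113
      return true;   // long double is IEEE binary128, so libm wrappers suffice
    #else
      return false;  // REAL(KIND=16) is disabled for this target
    #endif
    }
    // When this yields false, selected_real_kind(30) returns -1 and uses of
    // REAL(KIND=16) emit the warning quoted above.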
--- flang/include/flang/Tools/TargetSetup.h | 21 +++++++- flang/module/ieee_arithmetic.f90 | 49 +++++++++++++++++++ flang/test/CMakeLists.txt | 10 ++++ flang/test/Evaluate/fold-out_of_range.f90 | 1 + flang/test/Evaluate/folding07.f90 | 1 + .../Lower/Intrinsics/ieee_class_queries.f90 | 1 + .../test/Lower/Intrinsics/ieee_unordered.f90 | 1 + flang/test/Lower/common-block.f90 | 2 +- flang/test/Semantics/kinds03.f90 | 1 + flang/test/Semantics/modfile26.f90 | 1 + flang/test/Semantics/realkinds-aarch64-01.f90 | 1 + flang/test/lit.cfg.py | 3 +- flang/test/lit.site.cfg.py.in | 1 + flang/tools/f18/CMakeLists.txt | 25 +++++++++- 14 files changed, 114 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 37c1e1d2ff63f..ee89249441c17 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -12,6 +12,7 @@ #include "flang/Evaluate/target.h" #include "flang/Frontend/TargetOptions.h" #include "llvm/Target/TargetMachine.h" +#include namespace Fortran::tools { @@ -23,9 +24,27 @@ namespace Fortran::tools { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; // FIXME: Handle real(3) ? - if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) + if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { targetCharacteristics.DisableType( Fortran::common::TypeCategory::Real, /*kind=*/10); + } + + // Figure out if we can support F128: see + // flang/runtime/Float128Math/math-entries.h + // TODO: this should be taken from TargetInfo::getLongDoubleFormat to support + // cross-compilation +#ifdef FLANG_RUNTIME_F128_MATH_LIB + // we can use libquadmath wrappers + constexpr bool f128Support = true; +#elif LDBL_MANT_DIG == 113 + // we can use libm wrappers + constexpr bool f128Support = true; +#else + constexpr bool f128Support = false; +#endif + + if constexpr (!f128Support) + targetCharacteristics.DisableType(Fortran::common::TypeCategory::Real, 16); for (auto realKind : targetOptions.disabledRealKinds) targetCharacteristics.DisableType(common::TypeCategory::Real, realKind); diff --git a/flang/module/ieee_arithmetic.f90 b/flang/module/ieee_arithmetic.f90 index 7c7721d78c1ed..32e640b9e2457 100644 --- a/flang/module/ieee_arithmetic.f90 +++ b/flang/module/ieee_arithmetic.f90 @@ -161,6 +161,8 @@ end function ieee_round_ne G(1) G(2) G(4) G(8) G(16) #define SPECIFICS_L(G) \ G(1) G(2) G(4) G(8) + +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_R(G) \ G(2) G(3) G(4) G(8) G(10) G(16) @@ -168,12 +170,24 @@ end function ieee_round_ne #define SPECIFICS_R(G) \ G(2) G(3) G(4) G(8) G(16) #endif +#else +#if __x86_64__ +#define SPECIFICS_R(G) \ + G(2) G(3) G(4) G(8) G(10) +#else +#define SPECIFICS_R(G) \ + G(2) G(3) G(4) G(8) +#endif +#endif + #define SPECIFICS_II(G) \ G(1,1) G(1,2) G(1,4) G(1,8) G(1,16) \ G(2,1) G(2,2) G(2,4) G(2,8) G(2,16) \ G(4,1) G(4,2) G(4,4) G(4,8) G(4,16) \ G(8,1) G(8,2) G(8,4) G(8,8) G(8,16) \ G(16,1) G(16,2) G(16,4) G(16,8) G(16,16) + +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_RI(G) \ G(2,1) G(2,2) G(2,4) G(2,8) G(2,16) \ @@ -190,7 +204,24 @@ end function ieee_round_ne G(8,1) G(8,2) G(8,4) G(8,8) G(8,16) \ G(16,1) G(16,2) G(16,4) G(16,8) G(16,16) #endif +#else +#if __x86_64__ +#define SPECIFICS_RI(G) \ + G(2,1) G(2,2) G(2,4) G(2,8) \ + G(3,1) G(3,2) G(3,4) G(3,8) \ + G(4,1) G(4,2) G(4,4) G(4,8) \ + G(8,1) G(8,2) G(8,4) G(8,8) \ + G(10,1) G(10,2) G(10,4) G(10,8) +#else +#define SPECIFICS_RI(G) \ + G(2,1) G(2,2) G(2,4) G(2,8) \ + G(3,1) G(3,2) 
G(3,4) G(3,8) \ + G(4,1) G(4,2) G(4,4) G(4,8) \ + G(8,1) G(8,2) G(8,4) G(8,8) +#endif +#endif +#if FLANG_SUPPORT_R16 #if __x86_64__ #define SPECIFICS_RR(G) \ G(2,2) G(2,3) G(2,4) G(2,8) G(2,10) G(2,16) \ @@ -207,6 +238,22 @@ end function ieee_round_ne G(8,2) G(8,3) G(8,4) G(8,8) G(8,16) \ G(16,2) G(16,3) G(16,4) G(16,8) G(16,16) #endif +#else +#if __x86_64__ +#define SPECIFICS_RR(G) \ + G(2,2) G(2,3) G(2,4) G(2,8) G(2,10) \ + G(3,2) G(3,3) G(3,4) G(3,8) G(3,10) \ + G(4,2) G(4,3) G(4,4) G(4,8) G(4,10) \ + G(8,2) G(8,3) G(8,4) G(8,8) G(8,10) \ + G(10,2) G(10,3) G(10,4) G(10,8) G(10,10) +#else +#define SPECIFICS_RR(G) \ + G(2,2) G(2,3) G(2,4) G(2,8) \ + G(3,2) G(3,3) G(3,4) G(3,8) \ + G(4,2) G(4,3) G(4,4) G(4,8) \ + G(8,2) G(8,3) G(8,4) G(8,8) +#endif +#endif #define IEEE_CLASS_R(XKIND) \ elemental type(ieee_class_type) function ieee_class_a##XKIND(x); \ @@ -462,8 +509,10 @@ end function ieee_real_a##AKIND##_i##KKIND; interface ieee_real SPECIFICS_I(IEEE_REAL_I) SPECIFICS_R(IEEE_REAL_R) +#if FLANG_SUPPORT_R16 SPECIFICS_II(IEEE_REAL_II) SPECIFICS_RI(IEEE_REAL_RI) +#endif end interface ieee_real public :: ieee_real #undef IEEE_REAL_I diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index 43ad1e3312b64..a18a5c6519eda 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -11,6 +11,16 @@ llvm_canonicalize_cmake_booleans( set(FLANG_TOOLS_DIR ${FLANG_BINARY_DIR}/bin) +# Check if 128-bit float computations can be done via long double +check_cxx_source_compiles( + "#include + #if LDBL_MANT_DIG != 113 + #error LDBL_MANT_DIG != 113 + #endif + int main() { return 0; } + " + HAVE_LDBL_MANT_DIG_113) + # FIXME In out-of-tree builds, "SHLIBDIR" is undefined and passing it to # `configure_lit_site_cfg` leads to a configuration error. This is currently # only required by plugins/examples, which are not supported in out-of-tree diff --git a/flang/test/Evaluate/fold-out_of_range.f90 b/flang/test/Evaluate/fold-out_of_range.f90 index 5a9f900beb2d5..6360eee5322bb 100644 --- a/flang/test/Evaluate/fold-out_of_range.f90 +++ b/flang/test/Evaluate/fold-out_of_range.f90 @@ -1,6 +1,7 @@ ! RUN: %python %S/test_folding.py %s %flang_fc1 -pedantic -triple x86_64-unknown-linux-gnu ! UNSUPPORTED: system-windows ! REQUIRES: target=x86_64{{.*}} +! REQUIRES: flang-supports-f128-math ! Tests folding of OUT_OF_RANGE(). module m integer(1), parameter :: i1v(*) = [ -huge(1_1) - 1_1, huge(1_1) ] diff --git a/flang/test/Evaluate/folding07.f90 b/flang/test/Evaluate/folding07.f90 index 3b6a99df38826..d51df7acf7b8a 100644 --- a/flang/test/Evaluate/folding07.f90 +++ b/flang/test/Evaluate/folding07.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_folding.py %s %flang_fc1 ! Test numeric model inquiry intrinsics diff --git a/flang/test/Lower/Intrinsics/ieee_class_queries.f90 b/flang/test/Lower/Intrinsics/ieee_class_queries.f90 index bb7787ea903e2..b2f9df83a902a 100644 --- a/flang/test/Lower/Intrinsics/ieee_class_queries.f90 +++ b/flang/test/Lower/Intrinsics/ieee_class_queries.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir -o - %s | FileCheck %s ! CHECK-LABEL: func @_QQmain diff --git a/flang/test/Lower/Intrinsics/ieee_unordered.f90 b/flang/test/Lower/Intrinsics/ieee_unordered.f90 index a6146eff7f06e..b7e81d53a2d75 100644 --- a/flang/test/Lower/Intrinsics/ieee_unordered.f90 +++ b/flang/test/Lower/Intrinsics/ieee_unordered.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s ! 
CHECK-LABEL: func @_QQmain diff --git a/flang/test/Lower/common-block.f90 b/flang/test/Lower/common-block.f90 index 94e61b8bcc33d..b5c1389df45d3 100644 --- a/flang/test/Lower/common-block.f90 +++ b/flang/test/Lower/common-block.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc %s -o - | tco | FileCheck %s ! RUN: %flang -emit-llvm -S -mmlir -disable-external-name-interop %s -o - | FileCheck %s @@ -78,4 +79,3 @@ subroutine s7() real(16) r16 common /co1/ r16 end subroutine - diff --git a/flang/test/Semantics/kinds03.f90 b/flang/test/Semantics/kinds03.f90 index 751d4a9ffa3cb..a15a4a9baa731 100644 --- a/flang/test/Semantics/kinds03.f90 +++ b/flang/test/Semantics/kinds03.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_symbols.py %s %flang_fc1 !DEF: /MainProgram1/ipdt DerivedType !DEF: /MainProgram1/ipdt/k TypeParam INTEGER(4) diff --git a/flang/test/Semantics/modfile26.f90 b/flang/test/Semantics/modfile26.f90 index 09dc8b0954d63..9c5d0015d9163 100644 --- a/flang/test/Semantics/modfile26.f90 +++ b/flang/test/Semantics/modfile26.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_modfile.py %s %flang_fc1 ! Intrinsics SELECTED_INT_KIND, SELECTED_REAL_KIND, PRECISION, RANGE, ! RADIX, DIGITS diff --git a/flang/test/Semantics/realkinds-aarch64-01.f90 b/flang/test/Semantics/realkinds-aarch64-01.f90 index e22920ff991e9..2520c0b84c0e6 100644 --- a/flang/test/Semantics/realkinds-aarch64-01.f90 +++ b/flang/test/Semantics/realkinds-aarch64-01.f90 @@ -1,4 +1,5 @@ ! REQUIRES: aarch64-registered-target +! REQUIRES: flang-supports-f128-math ! RUN: %python %S/test_modfile.py %s %flang_fc1 -triple aarch64-unknown-linux-gnu module m1 diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 37869e7e2ecd7..4acbc0606d197 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -216,8 +216,9 @@ # Add features and substitutions to test F128 math support. # %f128-lib substitution may be used to generate check prefixes # for LIT tests checking for F128 library support. -if config.flang_runtime_f128_math_lib: +if config.flang_runtime_f128_math_lib or config.have_ldbl_mant_dig_113: config.available_features.add("flang-supports-f128-math") +if config.flang_runtime_f128_math_lib: config.available_features.add( "flang-f128-math-lib-" + config.flang_runtime_f128_math_lib ) diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index fe6186d714071..d1a0ac763cf8a 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -31,6 +31,7 @@ if "openmp" in "@LLVM_ENABLE_RUNTIMES@".lower().split(";"): else: config.openmp_module_dir = None config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@" +config.have_ldbl_mant_dig_113 = "@HAVE_LDBL_MANT_DIG_113@" import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index cec4e2d810720..344a781c41e95 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -31,6 +31,25 @@ set(MODULES_WITHOUT_IMPLEMENTATION set(MODULES ${MODULES_WITH_IMPLEMENTATION} ${MODULES_WITHOUT_IMPLEMENTATION}) +# Check if 128-bit float computations can be done via long double. 
+check_cxx_source_compiles( + "#include + #if LDBL_MANT_DIG != 113 + #error LDBL_MANT_DIG != 113 + #endif + int main() { return 0; } + " + HAVE_LDBL_MANT_DIG_113) + +# Figure out whether we can support REAL(KIND=16) +if (FLANG_RUNTIME_F128_MATH_LIB) + set(FLANG_SUPPORT_R16 "1") +elseif (HAVE_LDBL_MANT_DIG_113) + set(FLANG_SUPPORT_R16 "1") +else() + set(FLANG_SUPPORT_R16 "0") +endif() + # Init variable to hold extra object files coming from the Fortran modules; # these module files will be contributed from the CMakeLists in flang/tools/f18. set(module_objects "") @@ -76,6 +95,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() endif() + set(decls "") + if (FLANG_SUPPORT_R16) + set(decls "-DFLANG_SUPPORT_R16") + endif() # Some modules have an implementation part that needs to be added to the # FortranRuntime library. @@ -92,7 +115,7 @@ if (NOT CMAKE_CROSSCOMPILING) # TODO: We may need to flag this with conditional, in case Flang is built w/o OpenMP support add_custom_command(OUTPUT ${base}.mod ${object_output} COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR} - COMMAND flang-new ${opts} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR} + COMMAND flang-new ${opts} ${decls} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR} ${FLANG_SOURCE_DIR}/module/${filename}.f90 DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends} ) From e80f48986c7ba6cc41378b8d8e12d804cf26895d Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Thu, 5 Sep 2024 17:01:56 +0200 Subject: [PATCH 237/425] [SCEV] BECount to zero if `((-C + (C smax %x)) /u %x), C > 0` holds The SCEV expression `((-C + (C smax %x)) /u %x)` can be folded to zero for any positive constant C. Proof: https://alive2.llvm.org/ce/z/_dLm8C. --- llvm/lib/Analysis/ScalarEvolution.cpp | 16 ++++ .../udiv-of-x-xsmaxone-fold.ll | 96 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6b4a81c217b3c..57e03f667ba6f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3547,6 +3547,22 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } } + // ((-C + (C smax %x)) /u %x) evaluates to zero, for any positive constant C. + if (const auto *AE = dyn_cast(LHS); + AE && AE->getNumOperands() == 2) { + if (const auto *VC = dyn_cast(AE->getOperand(0))) { + const APInt &NegC = VC->getAPInt(); + if (NegC.isNegative() && !NegC.isMinSignedValue()) { + const auto *MME = dyn_cast(AE->getOperand(1)); + if (MME && MME->getNumOperands() == 2 && + isa(MME->getOperand(0)) && + cast(MME->getOperand(0))->getAPInt() == -NegC && + MME->getOperand(1) == RHS) + return getZero(LHS->getType()); + } + } + } + // The Insertion Point (IP) might be invalid by now (due to UniqueSCEVs // changes). Make sure we get a new one. 
   IP = nullptr;
diff --git a/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll b/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll
new file mode 100644
index 0000000000000..9405c0f726ac7
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/udiv-of-x-xsmaxone-fold.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -disable-output -passes="print<scalar-evolution>" < %s 2>&1 | FileCheck %s
+
+define i32 @test_expr_with_constant_1(i32 %x) {
+; CHECK-LABEL: 'test_expr_with_constant_1'
+; CHECK-NEXT: Classifying expressions for: @test_expr_with_constant_1
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 1)
+; CHECK-NEXT: --> (1 smax %x) U: [1,-2147483648) S: [1,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -1
+; CHECK-NEXT: --> (-1 + (1 smax %x)) U: [0,2147483647) S: [0,2147483647)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_with_constant_1
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 1)
+  %add = add nsw i32 %smax, -1
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Non-1 constant: (-2 + (2 smax %x)) /u %x
+define i32 @test_expr_with_constant_2(i32 %x) {
+; CHECK-LABEL: 'test_expr_with_constant_2'
+; CHECK-NEXT: Classifying expressions for: @test_expr_with_constant_2
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+; CHECK-NEXT: --> (2 smax %x) U: [2,-2147483648) S: [2,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -2
+; CHECK-NEXT: --> (-2 + (2 smax %x)) U: [0,2147483646) S: [0,2147483646)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_with_constant_2
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+  %add = add nsw i32 %smax, -2
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Negative test, constants mismatch: (-3 + (2 smax %x)) /u %x
+define i32 @test_expr_mismatch_constants(i32 %x) {
+; CHECK-LABEL: 'test_expr_mismatch_constants'
+; CHECK-NEXT: Classifying expressions for: @test_expr_mismatch_constants
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+; CHECK-NEXT: --> (2 smax %x) U: [2,-2147483648) S: [2,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, -3
+; CHECK-NEXT: --> (-3 + (2 smax %x)) U: [-1,2147483645) S: [-1,2147483645)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> ((-3 + (2 smax %x)) /u %x) U: full-set S: full-set
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_mismatch_constants
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 2)
+  %add = add nsw i32 %smax, -3
+  %udiv = udiv i32 %add, %x
+  ret i32 %udiv
+}
+
+; Negative constant: (3 + (-3 smax %x)) /u %x
+define i32 @test_expr_negative_constant(i32 %x) {
+; CHECK-LABEL: 'test_expr_negative_constant'
+; CHECK-NEXT: Classifying expressions for: @test_expr_negative_constant
+; CHECK-NEXT: %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 -3)
+; CHECK-NEXT: --> (-3 smax %x) U: [-3,-2147483648) S: [-3,-2147483648)
+; CHECK-NEXT: %add = add nsw i32 %smax, 3
+; CHECK-NEXT: --> (3 + (-3 smax %x)) U: [0,-2147483645) S: [0,-2147483645)
+; CHECK-NEXT: %udiv = udiv i32 %add, %x
+; CHECK-NEXT: --> ((3 + (-3 smax %x)) /u %x) U: [0,-2147483645) S: [0,-2147483645)
+; CHECK-NEXT: Determining loop execution counts for: @test_expr_negative_constant
+;
+entry:
+  %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 -3)
+  %add = add nsw 
i32 %smax, 3 + %udiv = udiv i32 %add, %x + ret i32 %udiv +} + +; Negative signed minimum value. +define i8 @text_expr_with_constant_signed_min(i8 %x) { +; CHECK-LABEL: 'text_expr_with_constant_signed_min' +; CHECK-NEXT: Classifying expressions for: @text_expr_with_constant_signed_min +; CHECK-NEXT: %smax = tail call i8 @llvm.smax.i8(i8 %x, i8 -128) +; CHECK-NEXT: --> %x U: full-set S: full-set +; CHECK-NEXT: %add = add nsw i8 %smax, -128 +; CHECK-NEXT: --> (-128 + %x) U: full-set S: full-set +; CHECK-NEXT: %udiv = udiv i8 %add, %x +; CHECK-NEXT: --> ((-128 + %x) /u %x) U: full-set S: full-set +; CHECK-NEXT: Determining loop execution counts for: @text_expr_with_constant_signed_min +; +entry: + %smax = tail call i8 @llvm.smax.i8(i8 %x, i8 128) + %add = add nsw i8 %smax, -128 + %udiv = udiv i8 %add, %x + ret i8 %udiv +} From 7eca38ce76d5d1915f4ab7e665964062c0b37697 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 5 Sep 2024 16:13:11 +0100 Subject: [PATCH 238/425] Reland "[clang] Add nuw attribute to GEPs (#105496)" (#107257) Add nuw attribute to inbounds GEPs where the expression used to form the GEP is an addition of unsigned indices. Relands #105496, which was reverted because it exposed a miscompilation arising from #98608. This is now fixed by #106512. --- clang/lib/CodeGen/CGBuilder.h | 6 +- clang/lib/CodeGen/CGExprScalar.cpp | 17 +- clang/test/CodeGen/2005-01-02-ConstantInits.c | 4 +- clang/test/CodeGen/PowerPC/ppc-emmintrin.c | 12 +- clang/test/CodeGen/PowerPC/ppc-xmmintrin.c | 16 +- clang/test/CodeGen/attr-counted-by.c | 16 +- ...d-nonzero-offset-when-nullptr-is-defined.c | 2 +- .../catch-nullptr-and-nonzero-offset.c | 64 ++--- .../CodeGen/catch-pointer-overflow-volatile.c | 2 +- clang/test/CodeGen/catch-pointer-overflow.c | 6 +- clang/test/CodeGen/ext-int.c | 2 +- .../test/CodeGen/hexagon-brev-ld-ptr-incdec.c | 6 +- clang/test/CodeGen/integer-overflow.c | 6 +- clang/test/CodeGen/ms-intrinsics.c | 12 +- clang/test/CodeGen/ubsan-pointer-overflow.m | 2 +- clang/test/CodeGen/vla.c | 2 +- .../attr-likelihood-iteration-stmt.cpp | 36 +-- clang/test/CodeGenCXX/for-range.cpp | 12 +- .../CodeGenCXX/pr45964-decomp-transform.cpp | 2 +- clang/test/CodeGenCXX/vla.cpp | 4 +- .../CodeGenHLSL/buffer-array-operator.hlsl | 4 +- .../CodeGenSYCL/address-space-deduction.cpp | 48 ++-- clang/test/Headers/__clang_hip_math.hip | 70 ++--- clang/test/OpenMP/bug60602.cpp | 8 +- clang/test/OpenMP/declare_mapper_codegen.cpp | 4 +- clang/test/OpenMP/distribute_codegen.cpp | 32 +-- ...te_parallel_for_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/distribute_simd_codegen.cpp | 96 +++---- clang/test/OpenMP/for_linear_codegen.cpp | 2 +- clang/test/OpenMP/for_reduction_codegen.cpp | 64 ++--- .../test/OpenMP/for_reduction_codegen_UDR.cpp | 32 +-- .../OpenMP/for_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/for_scan_codegen.cpp | 40 +-- clang/test/OpenMP/for_simd_scan_codegen.cpp | 40 +-- clang/test/OpenMP/irbuilder_for_iterator.cpp | 6 +- clang/test/OpenMP/irbuilder_for_rangefor.cpp | 6 +- clang/test/OpenMP/irbuilder_for_unsigned.c | 8 +- .../test/OpenMP/irbuilder_for_unsigned_auto.c | 8 +- .../test/OpenMP/irbuilder_for_unsigned_down.c | 2 +- .../OpenMP/irbuilder_for_unsigned_dynamic.c | 8 +- .../irbuilder_for_unsigned_dynamic_chunked.c | 8 +- .../OpenMP/irbuilder_for_unsigned_runtime.c | 8 +- .../irbuilder_for_unsigned_static_chunked.c | 8 +- clang/test/OpenMP/map_struct_ordering.cpp | 2 +- .../master_taskloop_in_reduction_codegen.cpp | 10 +- .../master_taskloop_reduction_codegen.cpp | 12 +- 
...ter_taskloop_simd_in_reduction_codegen.cpp | 10 +- ...master_taskloop_simd_reduction_codegen.cpp | 12 +- clang/test/OpenMP/ordered_codegen.cpp | 80 +++--- clang/test/OpenMP/parallel_for_codegen.cpp | 144 +++++----- .../OpenMP/parallel_for_linear_codegen.cpp | 2 +- .../parallel_for_reduction_task_codegen.cpp | 24 +- .../test/OpenMP/parallel_for_scan_codegen.cpp | 44 ++-- .../OpenMP/parallel_for_simd_scan_codegen.cpp | 40 +-- ...parallel_master_reduction_task_codegen.cpp | 24 +- ...llel_master_taskloop_reduction_codegen.cpp | 12 +- ...master_taskloop_simd_reduction_codegen.cpp | 12 +- .../OpenMP/parallel_reduction_codegen.cpp | 12 +- .../parallel_reduction_task_codegen.cpp | 24 +- ...rallel_sections_reduction_task_codegen.cpp | 24 +- clang/test/OpenMP/reduction_implicit_map.cpp | 50 ++-- .../sections_reduction_task_codegen.cpp | 24 +- .../target_data_use_device_addr_codegen.cpp | 10 +- .../target_data_use_device_ptr_codegen.cpp | 82 +++--- .../OpenMP/target_has_device_addr_codegen.cpp | 24 +- .../OpenMP/target_in_reduction_codegen.cpp | 10 +- .../OpenMP/target_is_device_ptr_codegen.cpp | 208 +++++++-------- ...arget_map_both_pointer_pointee_codegen.cpp | 2 +- clang/test/OpenMP/target_map_codegen_01.cpp | 2 +- clang/test/OpenMP/target_map_codegen_21.cpp | 6 +- clang/test/OpenMP/target_map_codegen_27.cpp | 2 +- clang/test/OpenMP/target_map_codegen_28.cpp | 6 +- clang/test/OpenMP/target_map_codegen_29.cpp | 4 +- .../OpenMP/target_map_deref_array_codegen.cpp | 2 +- ..._map_member_expr_array_section_codegen.cpp | 8 +- .../OpenMP/target_map_member_expr_codegen.cpp | 6 +- ...target_map_nest_defalut_mapper_codegen.cpp | 2 +- ...et_parallel_for_reduction_task_codegen.cpp | 24 +- ...target_parallel_reduction_task_codegen.cpp | 24 +- .../OpenMP/target_task_affinity_codegen.cpp | 24 +- ...te_parallel_for_reduction_task_codegen.cpp | 44 ++-- clang/test/OpenMP/target_update_codegen.cpp | 34 +-- clang/test/OpenMP/task_codegen.c | 2 +- clang/test/OpenMP/task_codegen.cpp | 248 +++++++++--------- .../test/OpenMP/task_in_reduction_codegen.cpp | 10 +- .../taskgroup_task_reduction_codegen.cpp | 10 +- .../OpenMP/taskloop_in_reduction_codegen.cpp | 10 +- .../OpenMP/taskloop_reduction_codegen.cpp | 12 +- .../taskloop_simd_in_reduction_codegen.cpp | 10 +- .../taskloop_simd_reduction_codegen.cpp | 12 +- ...te_parallel_for_reduction_task_codegen.cpp | 44 ++-- 91 files changed, 1118 insertions(+), 1105 deletions(-) diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index 08730a6a6672a..b8036cf6e6a30 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -14,6 +14,7 @@ #include "CodeGenTypeCache.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Type.h" @@ -334,9 +335,10 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateGEP(Address Addr, ArrayRef IdxList, llvm::Type *ElementType, CharUnits Align, - const Twine &Name = "") { + const Twine &Name = "", + llvm::GEPNoWrapFlags NW = llvm::GEPNoWrapFlags::none()) { llvm::Value *Ptr = emitRawPointerFromAddress(Addr); - return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name), + return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name, NW), ElementType, Align); } diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 7aa2d3d89c293..88fbbe6c4c965 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -36,6 
+36,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/FixedPointBuilder.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" @@ -5759,7 +5760,12 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, const Twine &Name) { llvm::Type *PtrTy = Ptr->getType(); - Value *GEPVal = Builder.CreateInBoundsGEP(ElemTy, Ptr, IdxList, Name); + + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + Value *GEPVal = Builder.CreateGEP(ElemTy, Ptr, IdxList, Name, NWFlags); // If the pointer overflow sanitizer isn't enabled, do nothing. if (!SanOpts.has(SanitizerKind::PointerOverflow)) @@ -5874,8 +5880,13 @@ Address CodeGenFunction::EmitCheckedInBoundsGEP( Address Addr, ArrayRef IdxList, llvm::Type *elementType, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, CharUnits Align, const Twine &Name) { - if (!SanOpts.has(SanitizerKind::PointerOverflow)) - return Builder.CreateInBoundsGEP(Addr, IdxList, elementType, Align, Name); + if (!SanOpts.has(SanitizerKind::PointerOverflow)) { + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + return Builder.CreateGEP(Addr, IdxList, elementType, Align, Name, NWFlags); + } return RawAddress( EmitCheckedInBoundsGEP(Addr.getElementType(), Addr.emitRawPointer(*this), diff --git a/clang/test/CodeGen/2005-01-02-ConstantInits.c b/clang/test/CodeGen/2005-01-02-ConstantInits.c index 7772a64331ffb..d90c2ea42da61 100644 --- a/clang/test/CodeGen/2005-01-02-ConstantInits.c +++ b/clang/test/CodeGen/2005-01-02-ConstantInits.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "@.+" +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "[A-Za-z].*" // RUN: %clang_cc1 -triple=x86_64-unknown-linux %s -emit-llvm -o - | FileCheck %s // This tests all kinds of hard cases with initializers and @@ -51,7 +51,7 @@ int foo(int i) { return bar(&Arr[49])+bar(&Arr[i]); } // CHECK-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 // CHECK-NEXT: store ptr @Arr, ptr [[P]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[P]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ADDR]], align 4 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[TMP1]] to i64 diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c index a3650beec625f..4c4d0dfce05ea 100644 --- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c @@ -1012,14 +1012,14 @@ test_shuffle() { // CHECK: %[[SHR:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr 
inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16]) @@ -1050,7 +1050,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) // CHECK-LABEL: define available_externally <2 x i64> @_mm_shufflelo_epi16 @@ -1067,7 +1067,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) void __attribute__((noinline)) diff --git a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c index 95dfd1202f157..4a15fa9f76cee 100644 --- a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c @@ -894,16 +894,16 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} 
+// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK: call <2 x i64> @vec_splats(unsigned long long) @@ -923,14 +923,14 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD2:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD2]], i32 3 // CHECK: call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16]) diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index ab36b6e7720ba..a06e815737f4e 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -118,7 +118,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // 
SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] @@ -134,7 +134,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP0]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -142,7 +142,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -150,7 +150,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -207,7 +207,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) @@ -231,7 +231,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -239,7 +239,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -247,7 +247,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c index 39ede01d6e3b8..8a560a47ad1e1 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c @@ -33,7 +33,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c index e93dbcb9f647b..d884993ffb2b3 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c @@ -50,7 +50,7 @@ char *var_var(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr 
%[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -83,7 +83,7 @@ char *var_zero(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 0 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 0 // CHECK-SANITIZE-C-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -111,7 +111,7 @@ char *var_one(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -140,7 +140,7 @@ char *var_allones(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 -1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 -1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], -1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -171,7 +171,7 @@ char *nullptr_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr null, i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr null, i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 
%[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -217,17 +217,17 @@ char *nullptr_zero(void) { char *nullptr_one_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 1) static char *const base = (char *)0; static const unsigned long offset = 1; #line 700 @@ -237,17 +237,17 @@ char *nullptr_one_BAD(void) { char *nullptr_allones_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: 
unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) static char *const base = (char *)0; static const unsigned long offset = -1; #line 800 @@ -262,7 +262,7 @@ char *one_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -312,17 +312,17 @@ char *one_one_OK(void) { // CHECK: define{{.*}} ptr @one_one_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) static char *const base = (char *)1; static const unsigned long offset = 1; #line 1100 @@ -333,17 +333,17 @@ char *one_allones_BAD(void) { // CHECK: 
define{{.*}} ptr @one_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) static char *const base = (char *)1; static const unsigned long offset = -1; #line 1200 @@ -358,7 +358,7 @@ char *allones_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 -1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 -1 to ptr), i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -408,17 +408,17 @@ char *allones_one_BAD(void) { // CHECK: define{{.*}} ptr @allones_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 
-1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) static char *const base = (char *)-1; static const unsigned long offset = 1; #line 1500 @@ -429,17 +429,17 @@ char *allones_allones_OK(void) { // CHECK: define{{.*}} ptr @allones_allones_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr 
inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) static char *const base = (char *)-1; static const unsigned long offset = -1; #line 1600 @@ -461,7 +461,7 @@ char *void_ptr(void *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow-volatile.c b/clang/test/CodeGen/catch-pointer-overflow-volatile.c index 4b0653a0ae59e..626bbc0db7afb 100644 --- a/clang/test/CodeGen/catch-pointer-overflow-volatile.c +++ b/clang/test/CodeGen/catch-pointer-overflow-volatile.c @@ -23,7 +23,7 @@ char *volatile_ptr(char *volatile base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load volatile ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow.c b/clang/test/CodeGen/catch-pointer-overflow.c index 899af73bd81e0..1f7f1729098c7 100644 --- a/clang/test/CodeGen/catch-pointer-overflow.c +++ b/clang/test/CodeGen/catch-pointer-overflow.c @@ -30,7 +30,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: 
%[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -179,7 +179,7 @@ char *postinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -241,7 +241,7 @@ char *preinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize diff --git a/clang/test/CodeGen/ext-int.c b/clang/test/CodeGen/ext-int.c index 714b7e122a706..e3d609a4ba4a2 100644 --- a/clang/test/CodeGen/ext-int.c +++ b/clang/test/CodeGen/ext-int.c @@ -154,7 +154,7 @@ _BitInt(129) *f1(_BitInt(129) *p) { } char *f2(char *p) { - // CHECK64: getelementptr inbounds i8, {{.*}} i64 24 + // CHECK64: getelementptr inbounds nuw i8, {{.*}} i64 24 return p + sizeof(_BitInt(129)); } diff --git a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c index 7802168de4d75..d25d1e04f15fa 100644 --- a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c +++ b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c @@ -6,9 +6,9 @@ // the return value will be the value in A[2] // CHECK: @brev_ptr_inc // CHECK-DAG: llvm.hexagon.L2.loadri.pbr -// CHECK-DAG: getelementptr inbounds i8, {{.*}}i32 4 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 8 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 4 +// CHECK-DAG: getelementptr inbounds nuw i8, {{.*}}i32 4 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 8 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 4 int brev_ptr_inc(int A[], int B[]) { int *p0 = &B[0]; int *p1 = &A[0]; diff --git a/clang/test/CodeGen/integer-overflow.c b/clang/test/CodeGen/integer-overflow.c index 461b026d39615..9e8cde8b33b16 100644 --- a/clang/test/CodeGen/integer-overflow.c +++ b/clang/test/CodeGen/integer-overflow.c @@ -60,10 +60,10 @@ void test1(void) { // -fwrapv should turn off inbounds for GEP's, PR9256 extern int* P; ++P; - // DEFAULT: getelementptr inbounds i32, ptr + // DEFAULT: 
getelementptr inbounds nuw i32, ptr // WRAPV: getelementptr i32, ptr - // TRAPV: getelementptr inbounds i32, ptr - // CATCH_UB_POINTER: getelementptr inbounds i32, ptr + // TRAPV: getelementptr inbounds nuw i32, ptr + // CATCH_UB_POINTER: getelementptr inbounds nuw i32, ptr // NOCATCH_UB_POINTER: getelementptr i32, ptr // PR9350: char pre-increment never overflows. diff --git a/clang/test/CodeGen/ms-intrinsics.c b/clang/test/CodeGen/ms-intrinsics.c index c3d64fda0b901..459a708d9b2e0 100644 --- a/clang/test/CodeGen/ms-intrinsics.c +++ b/clang/test/CodeGen/ms-intrinsics.c @@ -156,7 +156,7 @@ unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[INDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 // CHECK: br label %[[END_LABEL]] @@ -171,7 +171,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[REVINDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 @@ -437,10 +437,10 @@ unsigned char test_InterlockedCompareExchange128( ++ExchangeLow, ++ComparandResult); } // CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{[a-z_ ]*}}%ComparandResult){{.*}}{ -// CHECK-64: %incdec.ptr = getelementptr inbounds i8, ptr %Destination, i64 8 +// CHECK-64: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Destination, i64 8 // CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1 // CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1 -// CHECK-64: %incdec.ptr2 = getelementptr inbounds i8, ptr %ComparandResult, i64 8 +// CHECK-64: %incdec.ptr2 = getelementptr inbounds nuw i8, ptr %ComparandResult, i64 8 // CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128 // CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128 // CHECK-64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64 @@ -486,7 +486,7 @@ short test_InterlockedIncrement16(short volatile *Addend) { return _InterlockedIncrement16(++Addend); } // CHECK: define{{.*}}i16 @test_InterlockedIncrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 2 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 2 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 // CHECK: ret i16 [[RESULT]] @@ -496,7 +496,7 @@ long test_InterlockedIncrement(long volatile *Addend) { return _InterlockedIncrement(++Addend); } // CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr 
%Addend, {{i64|i32}} 4 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 4 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 // CHECK: ret i32 [[RESULT]] diff --git a/clang/test/CodeGen/ubsan-pointer-overflow.m b/clang/test/CodeGen/ubsan-pointer-overflow.m index 9192598da92fc..4ecdac655669f 100644 --- a/clang/test/CodeGen/ubsan-pointer-overflow.m +++ b/clang/test/CodeGen/ubsan-pointer-overflow.m @@ -5,7 +5,7 @@ void variable_len_array_arith(int n, int k) { int vla[n]; int (*p)[n] = &vla; - // CHECK: getelementptr inbounds i32, ptr {{.*}}, i64 [[INC:%.*]] + // CHECK: getelementptr inbounds nuw i32, ptr {{.*}}, i64 [[INC:%.*]] // CHECK: @llvm.smul.with.overflow.i64(i64 4, i64 [[INC]]), !nosanitize // CHECK-NOT: select // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}} diff --git a/clang/test/CodeGen/vla.c b/clang/test/CodeGen/vla.c index 33621c5dd7a29..a22ba727df2fe 100644 --- a/clang/test/CodeGen/vla.c +++ b/clang/test/CodeGen/vla.c @@ -120,7 +120,7 @@ int test4(unsigned n, char (*p)[n][n+1][6]) { // CHECK-NEXT: [[T2:%.*]] = udiv i32 [[T1]], 2 // CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[DIM0]], [[DIM1]] // CHECK-NEXT: [[T4:%.*]] = mul nsw i32 [[T2]], [[T3]] - // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds [6 x i8], ptr [[T0]], i32 [[T4]] + // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds nuw [6 x i8], ptr [[T0]], i32 [[T4]] // CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[N]], align 4 // CHECK-NEXT: [[T7:%.*]] = udiv i32 [[T6]], 4 // CHECK-NEXT: [[T8:%.*]] = sub i32 0, [[T7]] diff --git a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp index e842de6335046..fd9786de3a949 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp @@ -152,16 +152,16 @@ void f_branch_elided() // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16:![0-9]+]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] 
@@ -172,16 +172,16 @@ void f_branch_elided() // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // @@ -204,16 +204,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -224,16 +224,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // 
CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // diff --git a/clang/test/CodeGenCXX/for-range.cpp b/clang/test/CodeGenCXX/for-range.cpp index 10d27206d12e4..088a34647c374 100644 --- a/clang/test/CodeGenCXX/for-range.cpp +++ b/clang/test/CodeGenCXX/for-range.cpp @@ -33,7 +33,7 @@ B *end(C&); extern B array[5]; -// CHECK-LABEL: define {{[^@]+}}@_Z9for_arrayv( +// CHECK-LABEL: @_Z9for_arrayv( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -57,7 +57,7 @@ extern B array[5]; // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP3]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP3]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -70,7 +70,7 @@ void for_array() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z9for_rangev( +// CHECK-LABEL: @_Z9for_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -103,7 +103,7 @@ void for_array() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -116,7 +116,7 @@ void for_range() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z16for_member_rangev( +// CHECK-LABEL: @_Z16for_member_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -149,7 +149,7 @@ void for_range() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: diff --git a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp index f7df110ec0129..bcb2d875dce66 100644 --- a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp +++ b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp @@ -16,7 +16,7 @@ void (*d)(){test_transform<0>}; // CHECK-NEXT: [[BODY]]: 
// CHECK-NEXT: [[CUR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[BODY]] ] // CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i32, ptr [[BEGIN]], i64 [[CUR]] -// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds [1 x i32], ptr @a, i64 0, i64 [[CUR]] +// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds nuw [1 x i32], ptr @a, i64 0, i64 [[CUR]] // CHECK-NEXT: [[X:%.*]] = load i32, ptr [[SRC]] // CHECK-NEXT: store i32 [[X]], ptr [[DEST]] // CHECK-NEXT: [[NEXT]] = add nuw i64 [[CUR]], 1 diff --git a/clang/test/CodeGenCXX/vla.cpp b/clang/test/CodeGenCXX/vla.cpp index 4cf2b3b445b40..aadf51fce3a44 100644 --- a/clang/test/CodeGenCXX/vla.cpp +++ b/clang/test/CodeGenCXX/vla.cpp @@ -83,7 +83,7 @@ void test2(int b) { //CHECK: [[VLA_SIZEOF:%.*]] = mul nuw i64 4, [[VLA_NUM_ELEMENTS_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS_POST:%.*]] = udiv i64 [[VLA_SIZEOF]], 4 - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end1 //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] for (int d : varr) 0; @@ -116,7 +116,7 @@ void test3(int b, int c) { //CHECK-NEXT: [[VLA_SIZEOF_DIM2:%.*]] = mul nuw i64 4, [[VLA_DIM2_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS:%.*]] = udiv i64 [[VLA_SIZEOF]], [[VLA_SIZEOF_DIM2]] //CHECK-NEXT: [[VLA_END_INDEX:%.*]] = mul nsw i64 [[VLA_NUM_ELEMENTS]], [[VLA_DIM2_PRE]] - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] diff --git a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl index f5556df30871c..02e570ebdcb4f 100644 --- a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl +++ b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl @@ -17,7 +17,7 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx // Const comes next, and returns the pointer instead of the value. 
@@ -26,5 +26,5 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx diff --git a/clang/test/CodeGenSYCL/address-space-deduction.cpp b/clang/test/CodeGenSYCL/address-space-deduction.cpp index 96075a47343fe..5910ec3bfc305 100644 --- a/clang/test/CodeGenSYCL/address-space-deduction.cpp +++ b/clang/test/CodeGenSYCL/address-space-deduction.cpp @@ -33,55 +33,55 @@ // CHECK-NEXT: store ptr addrspace(4) [[I_ASCAST]], ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(4) [[TMP0]], [[I_ASCAST]] -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[CMP]] to i8 -// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[CMP]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: store i32 66, ptr addrspace(4) [[TMP1]], align 4 // CHECK-NEXT: store i32 23, ptr addrspace(4) [[VAR23_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[VAR23_ASCAST]], ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP3]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 +// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP2]], align 1 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: store ptr addrspace(4) [[ARRAYDECAY]], ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP5]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 +// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP3]], align 1 // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY1]], i64 10 // CHECK-NEXT: store ptr addrspace(4) [[ADD_PTR]], ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 // CHECK-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 -// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 -// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP6]], [[ADD_PTR3]] +// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 +// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP4]], [[ADD_PTR3]] // CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK: if.then: -// CHECK-NEXT: [[TMP7:%.*]] = 
load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP5]], align 4 // CHECK-NEXT: br label [[IF_END]] // CHECK: if.end: // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str to ptr addrspace(4)), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP8]], i64 0 -// CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 -// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP9]] to i32 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP6]], i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i32 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], 2 // CHECK-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK: cond.true: -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 // CHECK-NEXT: br label [[COND_END:%.*]] // CHECK: cond.false: // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: -// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP11]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] +// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP9]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] // CHECK-NEXT: store ptr addrspace(4) [[COND]], ptr addrspace(4) [[PHI_STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], 2 -// CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[CMP6]] to i64 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = zext i1 [[CMP6]] to i64 // CHECK-NEXT: [[COND7:%.*]] = select i1 [[CMP6]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), ptr addrspace(4) null // CHECK-NEXT: store ptr addrspace(4) [[COND7]], ptr addrspace(4) [[SELECT_NULL_ASCAST]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(4) [[TMP14]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[TMP12]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), ptr addrspace(4) [[SELECT_STR_TRIVIAL2_ASCAST]], align 8 // CHECK-NEXT: ret void // diff --git 
a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 9d202e0d04682..e4254d1e64bec 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -47,7 +47,7 @@ typedef unsigned long long uint64_t; // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -79,7 +79,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -120,7 +120,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I:%.*]] = add i64 [[ADD26_I]], [[CONV25_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_END31_I]] ], [ [[__TAGP_ADDR_0_I]], [[IF_ELSE17_I]] ] @@ -141,7 +141,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 // CHECK-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I14_I:%.*]] // CHECK: if.then.i: -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] // CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I:%.*]] [ // CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_PREHEADER:%.*]] @@ -173,7 +173,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // CHECK-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I:%.*]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I36_I]] // CHECK: cleanup.i36.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I37_I]] = phi ptr [ [[INCDEC_PTR_I40_I]], [[IF_END31_I_I]] ], [ [[__TAGP_ADDR_0_I31_I]], [[IF_ELSE17_I_I]] ] @@ -195,7 +195,7 @@ extern 
"C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // CHECK-NEXT: [[ADD_I_I:%.*]] = add i64 [[MUL_I_I]], -48 // CHECK-NEXT: [[SUB_I_I:%.*]] = add i64 [[ADD_I_I]], [[CONV5_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I_I]] // CHECK: cleanup.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I_I]] = phi ptr [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ], [ [[__TAGP_ADDR_0_I_I]], [[WHILE_BODY_I_I]] ] @@ -216,7 +216,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I26_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // CHECK-NEXT: [[ADD_I27_I:%.*]] = add i64 [[MUL_I25_I]], -48 // CHECK-NEXT: [[SUB_I28_I:%.*]] = add i64 [[ADD_I27_I]], [[CONV5_I26_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I20_I]] // CHECK: cleanup.i20.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I21_I]] = phi ptr [ [[INCDEC_PTR_I29_I]], [[IF_THEN_I24_I]] ], [ [[__TAGP_ADDR_0_I15_I]], [[WHILE_BODY_I18_I]] ] @@ -2367,7 +2367,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // DEFAULT: if.then.i.i: -// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2399,7 +2399,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] // DEFAULT: cleanup.i36.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2421,7 +2421,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] // DEFAULT: cleanup.i.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ 
-2442,7 +2442,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] // DEFAULT: cleanup.i20.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2466,7 +2466,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // APPROX: if.then.i.i: -// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2498,7 +2498,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I36_I_I]] // APPROX: cleanup.i36.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2520,7 +2520,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I_I_I]] // APPROX: cleanup.i.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2541,7 +2541,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I20_I_I]] // APPROX: cleanup.i20.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ 
[[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2565,7 +2565,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // DEFAULT: if.then.i.i: -// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2597,7 +2597,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] // DEFAULT: cleanup.i36.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2619,7 +2619,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] // DEFAULT: cleanup.i.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2640,7 +2640,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] // DEFAULT: cleanup.i20.i.i: // DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2663,7 +2663,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // APPROX: if.then.i.i: -// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // APPROX-NEXT: switch 
i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2695,7 +2695,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I36_I_I]] // APPROX: cleanup.i36.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2717,7 +2717,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I_I_I]] // APPROX: cleanup.i.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2738,7 +2738,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // APPROX-NEXT: br label [[CLEANUP_I20_I_I]] // APPROX: cleanup.i20.i.i: // APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -3059,7 +3059,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // DEFAULT: _ZL5normfiPKf.exit: @@ -3079,7 +3079,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr 
inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // FINITEONLY: _ZL5normfiPKf.exit: @@ -3099,7 +3099,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // APPROX: _ZL5normfiPKf.exit: @@ -3123,7 +3123,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // DEFAULT: _ZL4normiPKd.exit: @@ -3143,7 +3143,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // FINITEONLY: _ZL4normiPKd.exit: @@ -3163,7 +3163,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // APPROX: _ZL4normiPKd.exit: @@ -3483,7 +3483,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], 
[[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: @@ -3503,7 +3503,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: @@ -3523,7 +3523,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: @@ -3547,7 +3547,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: @@ -3567,7 +3567,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: @@ -3587,7 +3587,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // APPROX-NEXT: [[TMP0:%.*]] = 
load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp index cb2e4e5b11e33..0789ef958e523 100644 --- a/clang/test/OpenMP/bug60602.cpp +++ b/clang/test/OpenMP/bug60602.cpp @@ -58,13 +58,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i64 0 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP8]] to i64 // CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[CONV]], 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV2]], 4 @@ -134,13 +134,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP46:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP47:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP48:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i64 0 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i64 0 // CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP49]] to i64 // CHECK-NEXT: [[TMP50:%.*]] = mul nuw i64 [[CONV5]], 4 // CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP52]], i64 0 // CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[TMP53]] to i64 // CHECK-NEXT: [[TMP54:%.*]] = mul nuw i64 [[CONV7]], 4 diff --git a/clang/test/OpenMP/declare_mapper_codegen.cpp b/clang/test/OpenMP/declare_mapper_codegen.cpp index 52d5ceffa1471..d2954b7a74821 100644 --- a/clang/test/OpenMP/declare_mapper_codegen.cpp +++ b/clang/test/OpenMP/declare_mapper_codegen.cpp @@ -129,7 +129,7 @@ class C { // CK0-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// 
CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK0-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK0-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK0-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 @@ -965,7 +965,7 @@ class C { // CK4-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK4-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK4-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK4-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp index ea619cb6e0f26..6c588ba25db30 100644 --- a/clang/test/OpenMP/distribute_codegen.cpp +++ b/clang/test/OpenMP/distribute_codegen.cpp @@ -662,24 +662,24 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = 
getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1574,21 +1574,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2252,24 +2252,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] 
+// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -2790,21 +2790,21 @@ int fint(void) { return ftemplate(); } // CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] 
= getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp index b019b4ff92ad5..93a6779ac02e8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp @@ -175,16 +175,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -214,7 +214,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -229,19 +229,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// 
CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -562,9 +562,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index f7353172e235c..ad93fd6030ac7 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -706,24 +706,24 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: 
[[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1682,21 +1682,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group 
[[ACC_GRP18]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2664,24 +2664,24 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -3671,21 +3671,21 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds 
float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK7-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK7-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK7-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK7-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK7-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK7-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: @@ -4290,24 +4290,24 @@ int fint(void) { return ftemplate(); } // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK9-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM1]] // CHECK9-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr 
inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK9-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK9-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4606,21 +4606,21 @@ int fint(void) { return ftemplate(); } // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK11-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK11-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK11-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK11-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK11-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4928,24 +4928,24 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK13-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM1]] // CHECK13-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK13-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK13-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK13-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5275,21 +5275,21 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK15-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK15-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK15-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group 
[[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK15-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK15-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK15-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK15: omp.body.continue: @@ -5782,24 +5782,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -6373,21 +6373,21 @@ int fint(void) { return ftemplate(); } 
// CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: @@ -6970,24 +6970,24 @@ int fint(void) { return ftemplate(); } // CHECK21-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK21-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK21-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // 
CHECK21-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK21-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK21-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK21-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK21: omp.body.continue: @@ -7592,21 +7592,21 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK23-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK23-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK23-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK23-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK23-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK23-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP20]] // 
CHECK23-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK23: omp.body.continue: diff --git a/clang/test/OpenMP/for_linear_codegen.cpp b/clang/test/OpenMP/for_linear_codegen.cpp index 395ccdbeed763..5a21fe8509fd3 100644 --- a/clang/test/OpenMP/for_linear_codegen.cpp +++ b/clang/test/OpenMP/for_linear_codegen.cpp @@ -650,7 +650,7 @@ int main() { // CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP16]], [[MUL9]] // CHECK1-NEXT: store i32 [[ADD10]], ptr [[LVAR5]], align 4 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[PVAR4]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR4]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp index ea32e98bf1423..83632db238484 100644 --- a/clang/test/OpenMP/for_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_codegen.cpp @@ -1021,14 +1021,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1054,16 +1054,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[ARRAYIDX8]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN11:%.*]] = add nsw i64 0, [[TMP23]] -// 
CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] // CHECK1-NEXT: [[ARRAYDECAY13:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[ARRAYIDX12]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX9]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1580,10 +1580,10 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP3]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP4]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX4]], i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr [[ARR6]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -1757,13 +1757,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1963,11 +1963,11 @@ int main() { // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds 
nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2148,13 +2148,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2335,13 +2335,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 1 // CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR24]]) // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], 
align 8 @@ -2459,8 +2459,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2641,9 +2641,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2826,9 +2826,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -3012,9 +3012,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr 
inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 // CHECK1-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]] @@ -3974,8 +3974,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S.0], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp index 16d6c23542fce..82f94c949eea6 100644 --- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp +++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp @@ -1074,14 +1074,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1109,16 +1109,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX9]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 // 
CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN12:%.*]] = add nsw i64 0, [[TMP23]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] // CHECK1-NEXT: [[ARRAYDECAY14:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX13]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX15]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX10]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1669,13 +1669,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1877,8 +1877,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2066,9 +2066,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr 
[[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2979,8 +2979,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp index ea93323de77d0..b875279c2a144 100644 --- a/clang/test/OpenMP/for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp @@ -68,16 +68,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -107,7 +107,7 @@ int main(int argc, char **argv) { 
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -122,19 +122,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP37]], align 8 @@ -459,9 +459,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: 
[[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/for_scan_codegen.cpp b/clang/test/OpenMP/for_scan_codegen.cpp index 4cf18a76fbfef..61e6534db471e 100644 --- a/clang/test/OpenMP/for_scan_codegen.cpp +++ b/clang/test/OpenMP/for_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + 
// CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, 
ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/for_simd_scan_codegen.cpp b/clang/test/OpenMP/for_simd_scan_codegen.cpp index 29af5f74c5b5b..829f2656042fb 100644 --- a/clang/test/OpenMP/for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/for_simd_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp 
eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw 
double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp index 0098a7db575c3..ec1c3af744b49 100644 --- a/clang/test/OpenMP/irbuilder_for_iterator.cpp +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -78,18 +78,18 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp index 45b34621afbb9..86a043e638bc3 100644 --- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -94,18 +94,18 @@ extern "C" void 
workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP12]], [[TMP15]] // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c index b0043b823ac85..675871a87b3bd 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned.c @@ -65,24 +65,24 @@ extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) { // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]] // CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]] // CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK-NEXT: 
[[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP18]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c index 19b2770bfa2df..39ede3ef971d0 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_auto(float *a, float *b, float *c, float // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_down.c b/clang/test/OpenMP/irbuilder_for_unsigned_down.c index 6e179826a6efa..5515f086c34a7 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_down.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_down.c @@ -67,7 +67,7 @@ extern "C" void workshareloop_unsigned_down(float *a) { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] // CHECK-NEXT: store float [[CONV]], ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git 
a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c index 8f3297061938b..f20b60e608d2f 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic(float *a, float *b, float *c, flo // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c index c2b0948bf7aeb..599f256243b11 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic_chunked(float *a, float *b, float // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], 
i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c index 68becf9f694ad..c27bcba155910 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_runtime(float *a, float *b, float *c, flo // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git 
a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c index 71fb6b5473da8..b937568ca9f11 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c @@ -108,24 +108,24 @@ extern "C" void workshareloop_unsigned_static_chunked(float *a, float *b, float // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP19]], [[TMP22]] // CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP25]] // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP27]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP26]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp index d5b22d8ff2a4d..a52ddad465f37 100644 --- a/clang/test/OpenMP/map_struct_ordering.cpp +++ b/clang/test/OpenMP/map_struct_ordering.cpp @@ -57,7 +57,7 @@ int map_struct() { // CHECK-NEXT: [[DATUM:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[DATUM2:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATUM2]], align 8 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 1 // CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[DAT]] to i64 diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp index e90f0783787c0..7d467293d0c8f 100644 --- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp 
+++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], 
ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp index 180ff3a94d24c..b0652c843845c 100644 --- a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw 
[4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp index 2061da7c4d781..b0d00c5f539b1 100644 --- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call 
ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp index a69844dc4dee2..7def61251b24e 100644 --- a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = 
getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 0a73eaefc9808..67285cfaef34d 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -255,21 +255,21 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK1-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -485,24 +485,24 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr 
[[TMP8]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM13:%.*]] = zext i8 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK1-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -866,21 +866,21 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: 
[[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1110,24 +1110,24 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK1-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK1-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1495,21 +1495,21 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK3-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK3-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK3-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK3-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1725,24 +1725,24 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK3-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK3-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK3-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM13:%.*]] = zext i8 [[TMP18]] to i64 -// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK3-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -2106,21 +2106,21 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 
[[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2350,24 +2350,24 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// 
CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK3-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2674,21 +2674,21 @@ void foo_simd(int low, int up) { // CHECK5: for.body: // CHECK5-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP2:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 [[TMP2]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i64 [[TMP2]] // CHECK5-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP5:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[TMP5]] +// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[TMP5]] // CHECK5-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], [[TMP6]] // CHECK5-NEXT: [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP8:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[TMP8]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP8]] // CHECK5-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[MUL]], [[TMP9]] // CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP11:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[TMP11]] // CHECK5-NEXT: store float [[MUL3]], ptr [[ARRAYIDX4]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: @@ -2804,24 +2804,24 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP3]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP6:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM4:%.*]] = zext i8 [[TMP6]] to i64 -// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[IDXPROM4]] +// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = 
getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[IDXPROM4]] // CHECK5-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP4]], [[TMP7]] // CHECK5-NEXT: [[TMP8:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL]], [[TMP10]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM9]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: diff --git a/clang/test/OpenMP/parallel_for_codegen.cpp b/clang/test/OpenMP/parallel_for_codegen.cpp index 2dec32d71b91a..c7afae419509b 100644 --- a/clang/test/OpenMP/parallel_for_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_codegen.cpp @@ -665,24 +665,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ 
-779,21 +779,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -882,21 +882,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, 
!llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1159,24 +1159,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK1-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ 
-1303,7 +1303,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -1312,7 +1312,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK1-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -1781,24 +1781,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK2-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1895,21 +1895,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, 
!llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1998,21 +1998,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], 
i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2275,24 +2275,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK2-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK2-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK2-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2419,7 +2419,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 
[[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -2428,7 +2428,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK2-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK2-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -2897,24 +2897,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45:![0-9]+]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG46:![0-9]+]] // 
CHECK5: omp.body.continue: @@ -3011,21 +3011,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG54]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG56:![0-9]+]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG57:![0-9]+]] // CHECK5: omp.body.continue: @@ -3114,21 +3114,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG66]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG68:![0-9]+]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg 
[[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG69:![0-9]+]] // CHECK5: omp.body.continue: @@ -3391,24 +3391,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG97:![0-9]+]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], 
!llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] // CHECK5-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG98:![0-9]+]] // CHECK5: omp.body.continue: @@ -3535,7 +3535,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !dbg [[DBG111]] @@ -3544,7 +3544,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]], !dbg [[DBG111]] // CHECK5-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] @@ -4013,24 +4013,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK6-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK6-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK6-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK6-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK6-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4127,21 +4127,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr 
[[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4230,21 +4230,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4507,24 +4507,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr 
[[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK6-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK6-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK6-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4649,7 +4649,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK6-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -4658,7 +4658,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK6-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK6-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 diff --git a/clang/test/OpenMP/parallel_for_linear_codegen.cpp b/clang/test/OpenMP/parallel_for_linear_codegen.cpp index 8b46797ae253f..15eb0dfa42af5 100644 --- a/clang/test/OpenMP/parallel_for_linear_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_linear_codegen.cpp @@ -337,7 +337,7 @@ int main() { // CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], [[MUL6]] // CHECK1-NEXT: store i32 [[ADD7]], ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[PVAR2]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR2]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP17]], 1 diff --git 
a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp index 752aec788bf34..59d169d7a1738 100644 --- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp @@ -83,16 +83,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -122,7 +122,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -137,19 +137,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr 
[[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -470,9 +470,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp index 161534814a793..67b32407c712f 100644 --- a/clang/test/OpenMP/parallel_for_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp @@ -28,9 +28,9 @@ void baz(int n) { // CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call( // CHECK: [[LAST:%.+]] = mul nsw i64 9, % - // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[LAST]] + // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[LAST]] // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 @_ZZ3baziE1a, ptr align 4 [[LAST_REF]], i64 %{{.+}}, i1 false) - // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 9 + // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 9 // CHECK: [[LAST_VAL:%.+]] = load double, ptr [[LAST_REF_B]], // CHECK: store double [[LAST_VAL]], ptr @_ZZ3baziE1b, @@ -58,13 +58,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -91,13 +91,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -151,13 +151,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = 
getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -188,13 +188,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -226,13 +226,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -289,13 +289,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 
[[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp index 7e973a602a65c..cac997753d480 100644 --- a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp @@ -51,13 +51,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -84,13 +84,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr 
[[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -144,13 +144,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -181,13 +181,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -219,13 +219,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // 
CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -282,13 +282,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp index b17757a5f978a..c1fe00f238001 100644 --- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = 
getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -425,9 +425,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr 
inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp index 3c74aaca3f46d..1d106922435d5 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp index 7d4216ddde6a3..9a524c3b94c6e 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp +++ 
b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp index f49faa6b89deb..ce76429b871fe 100644 --- a/clang/test/OpenMP/parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp @@ -354,9 +354,9 @@ int main() { // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 
0 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -1632,9 +1632,9 @@ int main() { // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK3-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK3-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -2142,9 +2142,9 @@ int main() { // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK4-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK4-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] diff --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp index 208f7a41aa3db..40cc3103b1c0f 100644 --- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 
// CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -416,9 +416,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// 
CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp index 6d73652c3ea27..61597a074cf59 100644 --- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp @@ -81,16 +81,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -135,19 +135,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void 
@llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -458,9 +458,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp index 4d2b93ffd4712..a7db3da7d1f86 100644 --- a/clang/test/OpenMP/reduction_implicit_map.cpp +++ b/clang/test/OpenMP/reduction_implicit_map.cpp @@ -133,9 +133,9 @@ int main() // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = load 
ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i64 0 // CHECK-NEXT: store double 0.000000e+00, ptr [[E2]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 @@ -529,16 +529,16 @@ int main() // CHECK1-NEXT: store i64 9, ptr [[DOTOMP_UB]], align 8 // CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX1]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY2]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY5:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX4]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY5]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY5]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX6]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY7]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY7]], i64 1 // CHECK1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[TMP2]] @@ -564,18 +564,18 @@ int main() // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] // CHECK1-NEXT: [[TMP12:%.*]] = sdiv exact i64 [[TMP11]], ptrtoint (ptr getelementptr (double, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[VLA]], i64 [[TMP12]] -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY10:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX9]], i64 
0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY10]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY10]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY12:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX11]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY12]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY12]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY15:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX14]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY15]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY15]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY17:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX16]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY17]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY17]], i64 1 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX13]], ptr [[TMP15]], align 8 @@ -852,7 +852,7 @@ int main() // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 4 // CHECK2-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 @@ -930,10 +930,10 @@ int main() // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP47:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i32 0 // CHECK2-NEXT: [[TMP49:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP50]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP50]], i32 0 // CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP52:%.*]] = mul nuw i32 [[TMP51]], 4 // CHECK2-NEXT: [[TMP53:%.*]] = sext i32 [[TMP52]] to i64 @@ -1007,7 +1007,7 @@ int main() // CHECK2-NEXT: [[TMP86:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP86]], ptr [[SIZE_CASTED21]], align 4 // CHECK2-NEXT: [[TMP87:%.*]] = load i32, ptr [[SIZE_CASTED21]], align 4 -// CHECK2-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[A]], i32 0, i32 0 +// CHECK2-NEXT: 
[[ARRAYIDX22:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[A]], i32 0, i32 0 // CHECK2-NEXT: [[TMP88:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0 // CHECK2-NEXT: store i32 [[TMP87]], ptr [[TMP88]], align 4 // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0 @@ -1480,9 +1480,9 @@ int main() // CHECK2-NEXT: store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: store ptr [[INPUT]], ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP2]] @@ -1664,9 +1664,9 @@ int main() // CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 0 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT4]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP8]] @@ -1878,8 +1878,8 @@ int main() // CHECK2-NEXT: store i32 [[SIZE]], ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 0 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 1 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 1 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[A2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 2 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp index 1a2cf7aede321..5d749eeb81776 100644 --- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp @@ -82,16 +82,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load 
ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -121,7 +121,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -136,19 +136,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] 
= add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP37]], align 8 @@ -463,9 +463,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp index d912f801a33db..19a523ee165c8 100644 --- a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp @@ -54,12 +54,12 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[VLA_ADDR:%.+]] = alloca float, i64 %{{.+}}, // CHECK: [[PTR:%.+]] = load ptr, ptr [[PTR_ADDR]], -// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds float, ptr [[PTR]], i64 3 +// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[PTR]], i64 3 // CHECK: [[P5:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: [[ARR_IDX1:%.+]] = getelementptr inbounds float, ptr [[P5]], i64 0 // CHECK: [[P7:%.+]] = load ptr, ptr [[REF_ADDR]], // CHECK-NEXT: [[REF:%.+]] = load ptr, ptr [[REF_ADDR]], -// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds nuw [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[P10:%.+]] = mul nuw i64 {{.+}}, 4 // CHECK-NEXT: [[ARR_IDX5:%.+]] = getelementptr inbounds float, ptr [[VLA_ADDR]], i64 0 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SIZES]], ptr align 8 [[SIZES1]], i64 48, i1 false) @@ -132,14 +132,14 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[A_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[PTR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 -// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds i32, ptr %{{.+}}, i64 3 +// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds nuw i32, ptr %{{.+}}, i64 3 // CHECK: [[REF_REF:%.+]] = 
getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 2 // CHECK: [[REF_PTR:%.+]] = load ptr, ptr [[REF_REF]], // CHECK-NEXT: [[P3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 // CHECK: [[ARR_IDX5:%.+]] = getelementptr inbounds i32, ptr {{.+}}, i64 0 // CHECK: [[ARR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[A_ADDR2:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 // CHECK: [[P4:%.+]] = mul nuw i64 [[CONV:%.+]], 4 // CHECK: [[A_ADDR3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 @@ -147,7 +147,7 @@ int main() { // CHECK: [[L6:%.+]] = sext i32 [[L5]] to i64 // CHECK: [[LB_ADD_LEN:%lb_add_len]] = add nsw i64 -1, [[L6]] // CHECK: [[ARR_ADDR9:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len +// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len // CHECK: [[ARR_END:%.+]] = getelementptr i32, ptr [[ARR_IDX10]], i32 1 // CHECK: [[E:%.+]] = ptrtoint ptr [[ARR_END]] to i64 // CHECK: [[B:%.+]] = ptrtoint ptr [[A_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp index c90819dc2a22f..6d1c0213d648c 100644 --- a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp @@ -49,14 +49,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds double, ptr [[TT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TT]], i32 1 #pragma omp target data map(g[:10]) use_device_ptr(g) { ++g; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 ++g; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -67,26 +67,26 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE01]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(0) { ++l; } // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -97,14 
+97,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(1) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[CMP:%.+]] = icmp ne ptr %{{.+}}, null @@ -119,12 +119,12 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 // CK1: br label %[[BEND:.+]] // CK1: [[BELSE]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 // CK1: br label %[[BEND]] #pragma omp target data map(l[:10]) use_device_ptr(l) if(lr != 0) { @@ -142,7 +142,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[BEND]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -156,7 +156,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds float, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT2]], i32 1 #pragma omp target data map(lr[:10]) use_device_ptr(lr) { ++lr; @@ -164,7 +164,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE05]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds float, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTTT]], i32 1 ++lr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -175,14 +175,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(t[:10]) use_device_ptr(t) { ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE06]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -196,7 +196,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(tr[:10]) use_device_ptr(tr) { ++tr; @@ -204,7 +204,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE07]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds 
i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++tr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -215,14 +215,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE08]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; ++t; @@ -232,18 +232,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE09]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE10]] @@ -252,18 +252,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l,t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE10]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -274,14 +274,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE11]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -295,7 +295,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // 
CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(tr) { ++l; ++tr; @@ -303,7 +303,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE12]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++l; ++tr; } @@ -354,7 +354,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(a[:10]) use_device_ptr(a) { a++; @@ -362,7 +362,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; // CK2: [[BP:%.+]] = getelementptr inbounds [2 x ptr], ptr %{{.+}}, i32 0, i32 1 @@ -373,7 +373,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(b) { b++; @@ -382,7 +382,7 @@ struct ST { // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %{{.+}}, i32 0, i32 1 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK2: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK2: getelementptr inbounds double, ptr [[TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTTT]], i32 1 b++; // CK2: [[BP:%.+]] = getelementptr inbounds [3 x ptr], ptr %{{.+}}, i32 0, i32 2 @@ -393,7 +393,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(la[:10]) use_device_ptr(a) { a++; @@ -402,7 +402,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE02]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; la++; @@ -419,10 +419,10 @@ struct ST { // CK2: store ptr [[PVT1]], ptr [[_PVT1:%.+]], // CK2: [[TT2:%.+]] = load ptr, ptr [[_PVT2]], // CK2: [[_TT2:%.+]] = load ptr, ptr [[TT2]], - // CK2: getelementptr inbounds double, ptr [[_TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT2]], i32 1 // CK2: [[TT1:%.+]] = load ptr, ptr [[_PVT1]], // CK2: [[_TT1:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[_TT1]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT1]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(a, b) { a++; @@ -431,11 +431,11 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK2: 
[[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 // CK2: [[_DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 1 // CK2: [[_TTT:%.+]] = load ptr, ptr [[_DECL]], // CK2: [[_TTTT:%.+]] = load ptr, ptr [[_TTT]], - // CK2: getelementptr inbounds double, ptr [[_TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TTTT]], i32 1 a++; b++; } diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp index 08bcc87ca5f0a..39eaedb0e48d1 100644 --- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp @@ -586,7 +586,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -601,7 +601,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1079,7 +1079,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1094,7 +1094,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1133,7 +1133,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1148,7 +1148,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = 
getelementptr inbounds ptr, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1422,14 +1422,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[AA]], ptr [[RAA]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: store i32 1, ptr [[ARRAYIDX]], align 4 @@ -1478,14 +1478,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 @@ -1520,14 +1520,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 1 +// 
SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 diff --git a/clang/test/OpenMP/target_in_reduction_codegen.cpp b/clang/test/OpenMP/target_in_reduction_codegen.cpp index fb715e2de2a59..56191ee575136 100644 --- a/clang/test/OpenMP/target_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_in_reduction_codegen.cpp @@ -70,7 +70,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -85,7 +85,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP12]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -100,7 +100,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP21]], align 8 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP22]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -118,7 +118,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP35:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP35]], ptr [[DOTTASK_RED_]], align 
8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP36]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -133,7 +133,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP43]], align 8 // CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP44]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP46]], align 8 // CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp index 3a1c168533c37..505c34e21733c 100644 --- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp @@ -2142,7 +2142,7 @@ void bar() { // CK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2153,7 +2153,7 @@ void bar() { // CK10-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2164,7 +2164,7 @@ void bar() { // CK10-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2178,7 +2178,7 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: 
[[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2192,7 +2192,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2206,7 +2206,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2224,11 +2224,11 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK10-NEXT: ret void // @@ -2613,7 +2613,7 @@ void bar() { // CK11-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2624,7 +2624,7 @@ void bar() { // CK11-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2635,7 +2635,7 @@ void bar() { // CK11-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2649,7 +2649,7 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2663,7 +2663,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2677,7 +2677,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2695,11 +2695,11 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK11-NEXT: ret void // @@ -3084,7 +3084,7 @@ void bar() { // CK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3095,7 +3095,7 @@ void bar() { // CK12-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3106,7 +3106,7 @@ void bar() { // CK12-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // 
CK12-NEXT: ret void // @@ -3120,7 +3120,7 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3134,7 +3134,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3148,7 +3148,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3166,11 +3166,11 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK12-NEXT: ret void // @@ -3555,7 +3555,7 @@ void bar() { // CK13-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3566,7 +3566,7 @@ void bar() { // CK13-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3577,7 +3577,7 @@ void bar() { // CK13-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] 
= getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3591,7 +3591,7 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3605,7 +3605,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3619,7 +3619,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3637,11 +3637,11 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK13-NEXT: ret void // @@ -3674,34 +3674,34 @@ void bar() { // SIMD-ONLY00-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY00-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY00-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // 
SIMD-ONLY00-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY00-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY00-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3711,11 +3711,11 @@ void bar() { // SIMD-ONLY00-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY00-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY00-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY00-NEXT: ret void // @@ -3748,34 +3748,34 @@ void bar() { // SIMD-ONLY01-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// 
SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY01-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3785,11 +3785,11 @@ void bar() { // SIMD-ONLY01-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY01-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY01-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY01-NEXT: ret void // @@ -3822,34 +3822,34 @@ void bar() { // SIMD-ONLY02-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // 
SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY02-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY02-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3859,11 +3859,11 @@ void bar() { // SIMD-ONLY02-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 4 // SIMD-ONLY02-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY02-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], 
align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY02-NEXT: ret void // @@ -3896,34 +3896,34 @@ void bar() { // SIMD-ONLY03-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY03-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY03-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3933,11 +3933,11 @@ void bar() { // SIMD-ONLY03-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP19:%.*]] = load ptr, ptr 
[[_TMP8]], align 4 // SIMD-ONLY03-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY03-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY03-NEXT: ret void // @@ -3951,7 +3951,7 @@ void bar() { // CK20-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK20-NEXT: ret void // @@ -4185,7 +4185,7 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: ret void // @@ -4199,7 +4199,7 @@ void bar() { // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK20-NEXT: ret void // @@ -4212,12 +4212,12 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK20-NEXT: ret void // @@ -4231,7 +4231,7 @@ void bar() { // CK21-NEXT: call 
void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK21-NEXT: ret void // @@ -4465,7 +4465,7 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: ret void // @@ -4479,7 +4479,7 @@ void bar() { // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK21-NEXT: ret void // @@ -4492,12 +4492,12 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK21-NEXT: ret void // @@ -4511,7 +4511,7 @@ void bar() { // CK22-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK22-NEXT: ret void // @@ -4745,7 +4745,7 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: ret void // @@ -4759,7 +4759,7 @@ void bar() { // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK22-NEXT: ret void // @@ -4772,12 +4772,12 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK22-NEXT: ret void // @@ -4791,7 +4791,7 @@ void bar() { // CK23-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK23-NEXT: ret void // @@ -5025,7 +5025,7 @@ void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: ret void // @@ -5039,7 +5039,7 @@ void bar() { // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK23-NEXT: ret void // @@ -5052,12 +5052,12 @@ 
void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK23-NEXT: ret void // @@ -5071,7 +5071,7 @@ void bar() { // SIMD-ONLY10-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5101,21 +5101,21 @@ void bar() { // SIMD-ONLY10-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY10-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY10-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY10-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY10-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw 
double, ptr [[TMP5]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5145,7 +5145,7 @@ void bar() { // SIMD-ONLY11-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5175,21 +5175,21 @@ void bar() { // SIMD-ONLY11-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY11-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY11-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY11-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5219,7 +5219,7 @@ void bar() { // SIMD-ONLY12-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 
4 // SIMD-ONLY12-NEXT: ret void // @@ -5249,21 +5249,21 @@ void bar() { // SIMD-ONLY12-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY12-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY12-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY12-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY12-NEXT: ret void // @@ -5293,7 +5293,7 @@ void bar() { // SIMD-ONLY13-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // SIMD-ONLY13-NEXT: ret void // @@ -5323,21 +5323,21 @@ void bar() { // SIMD-ONLY13-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY13-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY13-NEXT: 
[[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY13-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY13-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY13-NEXT: ret void // diff --git a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp index fcaceac7d3467..87fa7fe462daa 100644 --- a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp +++ b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: store ptr [[CALL]], ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 0 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[PTR]], ptr [[TMP2]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_codegen_01.cpp b/clang/test/OpenMP/target_map_codegen_01.cpp index d112500eb5fdd..9f3553d2377cb 100644 --- a/clang/test/OpenMP/target_map_codegen_01.cpp +++ b/clang/test/OpenMP/target_map_codegen_01.cpp @@ -108,6 +108,6 @@ void implicit_maps_reference (int a, int *b){ // CK2: store ptr [[ADDR]], ptr [[REF]], // CK2: [[T:%.+]] = load ptr, ptr [[REF]], // CK2: [[TT:%.+]] = load ptr, ptr [[T]], -// CK2: getelementptr inbounds i32, ptr [[TT]], i32 1 +// CK2: getelementptr inbounds nuw i32, ptr [[TT]], i32 1 #endif // CK2 #endif diff --git a/clang/test/OpenMP/target_map_codegen_21.cpp b/clang/test/OpenMP/target_map_codegen_21.cpp index a1419b7d4beb8..f5c517692d8c8 100644 --- a/clang/test/OpenMP/target_map_codegen_21.cpp +++ b/clang/test/OpenMP/target_map_codegen_21.cpp @@ -185,7 +185,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @c, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL03:@.+]](ptr {{[^,]+}}) #pragma omp target map(c [1:4]) @@ -277,7 +277,7 
@@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @sc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL08:@.+]](ptr {{[^,]+}}) #pragma omp target map(sc [1:4]) @@ -369,7 +369,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @stc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL13:@.+]](ptr {{[^,]+}}) #pragma omp target map(stc [1:4]) diff --git a/clang/test/OpenMP/target_map_codegen_27.cpp b/clang/test/OpenMP/target_map_codegen_27.cpp index fe7ae12e00d13..bfe75bca481be 100644 --- a/clang/test/OpenMP/target_map_codegen_27.cpp +++ b/clang/test/OpenMP/target_map_codegen_27.cpp @@ -82,7 +82,7 @@ void explicit_maps_pointer_references (int *p){ // CK28-DAG: store ptr [[VAR1:%.+]], ptr [[P0]] // CK28-DAG: [[VAR0]] = load ptr, ptr [[VAR00:%.+]], // CK28-DAG: [[VAR00]] = load ptr, ptr [[VAR000:%.+]], -// CK28-DAG: [[VAR1]] = getelementptr inbounds i32, ptr [[VAR11:%.+]], i{{64|32}} 2 +// CK28-DAG: [[VAR1]] = getelementptr inbounds nuw i32, ptr [[VAR11:%.+]], i{{64|32}} 2 // CK28-DAG: [[VAR11]] = load ptr, ptr [[VAR111:%.+]], // CK28-DAG: [[VAR111]] = load ptr, ptr [[VAR1111:%.+]], diff --git a/clang/test/OpenMP/target_map_codegen_28.cpp b/clang/test/OpenMP/target_map_codegen_28.cpp index e92f7e4773ecf..67ea72d791d03 100644 --- a/clang/test/OpenMP/target_map_codegen_28.cpp +++ b/clang/test/OpenMP/target_map_codegen_28.cpp @@ -89,7 +89,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -129,7 +129,7 @@ struct SSB{ // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 // CK29-DAG: store ptr [[VAR1]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -164,7 +164,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr 
{{[^,]+}}) diff --git a/clang/test/OpenMP/target_map_codegen_29.cpp b/clang/test/OpenMP/target_map_codegen_29.cpp index 936a01573c2d2..3ca7b228d26c2 100644 --- a/clang/test/OpenMP/target_map_codegen_29.cpp +++ b/clang/test/OpenMP/target_map_codegen_29.cpp @@ -89,7 +89,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 2 // CK30-DAG: store ptr [[S_PTR1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 -// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTR1_BEGIN_REF]] = load ptr, ptr [[S_PTR1:%.+]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 @@ -98,7 +98,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 3 // CK30-DAG: store ptr [[S_PTRBASE1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 -// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTRBASE1_BEGIN_REF]] = load ptr, ptr [[S_PTRBASE1:%.+]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 void map_with_deep_copy() { diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp index 9d395b0ab8cd8..e61fc7296332b 100644 --- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp +++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp @@ -75,7 +75,7 @@ void foo(int **t1d) // CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i64 0 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp index 7a0da002fb944..692e3a4214c9d 100644 --- a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp @@ -28,12 +28,12 @@ struct maptest { // CHECK: getelementptr inbounds // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw 
%struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 @@ -64,12 +64,12 @@ struct maptest { // CHECK: [[SIZE:%.+]] = alloca [2 x i64], // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp index 9b64647928a24..fb36ba7b78d5b 100644 --- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp @@ -223,7 +223,7 @@ void foo() { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[A4:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP10]], i32 0, i32 0 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A4]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ASIZE]], align 4 // CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV]], 4 @@ -233,7 +233,7 @@ void foo() { // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[C5:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP16]], i32 0, i32 1 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[C5]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 0 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = zext i32 [[TMP18]] to i64 // CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[CONV7]], 4 @@ -343,7 +343,7 @@ void foo() { // CHECK-NEXT: [[TMP79:%.*]] = load ptr, ptr [[_TMP12]], align 8 // CHECK-NEXT: [[C15:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP79]], i32 0, i32 1 // CHECK-NEXT: [[TMP80:%.*]] = load ptr, ptr [[C15]], align 8 -// CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i64 0 +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP80]], i64 0 // CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[TMP81]] to i64 // CHECK-NEXT: [[TMP82:%.*]] = mul nuw i64 [[CONV17]], 4 diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp index ffb145d8e50fc..775f0b296b1b6 100644 --- a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp +++ b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 // CHECK-NEXT: store i32 222, ptr [[A]], align 4 -// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 0 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [10 x %struct.D], ptr [[SA]], i64 0, i64 0 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[SA]], ptr [[TMP0]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp index 3d0710acf0ee7..5cce677e88572 100644 --- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp @@ -96,16 +96,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -135,7 +135,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = 
getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -150,19 +150,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -483,9 +483,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr 
[[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp index 28d63dbf8c4a9..c0bb4a6d6cc82 100644 --- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp @@ -85,16 +85,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -139,19 +139,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr 
[[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -429,9 +429,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp index 85c5d63a6cd9c..53960cee4b730 100644 --- a/clang/test/OpenMP/target_task_affinity_codegen.cpp +++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp @@ -76,7 +76,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -102,7 +102,7 @@ int main() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 
8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i64 0 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -174,9 +174,9 @@ int main() { // CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -299,7 +299,7 @@ int main() { // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 0 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -325,7 +325,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 4 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 4 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -397,9 +397,9 @@ int main() { // CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK3-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK3-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 @@ -587,9 +587,9 @@ int main() { // CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK9-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK9-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK9-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK9-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -709,9 +709,9 @@ int main() { // CHECK11-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK11-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK11-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK11-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp index 6f671dbb27abb..2c36b410af064 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -91,16 +91,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -130,7 +130,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw 
[[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -145,19 +145,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -435,16 +435,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -474,7 +474,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -489,19 +489,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -822,9 +822,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // 
CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index 5e038989ab6dd..c8211f475c7fc 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1118,9 +1118,9 @@ struct ST { void foo(int arg) { ST arr[3][4]; // CK20: [[DIMS:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK20: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [[STRUCT_ST]]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} + // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} // CK20: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK20: store ptr [[ARR]], ptr [[BP0]], // CK20: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], {{.+}} 0, {{.+}} 0 @@ -1186,9 +1186,9 @@ struct ST { // CK21: _ZN2ST3fooEv void foo() { // CK21: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 + // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x [10 x ptr]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 + // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 // CK21: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 0 // CK21: [[BP0:%.+]] = getelementptr inbounds [2 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 @@ -1262,9 +1262,9 @@ struct ST { // CK22: _ZN2ST3fooEPA10_Pi void foo(int *arr[5][10]) { // CK22: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 + // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 // CK22: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 + // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 // CK22: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, 
{{.+}} 0 // CK22: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK22: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 @@ -1338,11 +1338,11 @@ void foo(int arg) { float farr[5][5][5]; // CK23: [[ARG_ADDR:%.+]] = alloca i32, // CK23: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK23: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [5 x [5 x float]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_DECAY]], {{.+}} + // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x float], ptr [[ARRAY_DECAY]], {{.+}} // CK23: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_2]], {{.+}} // CK23: [[MUL:%.+]] = mul nuw i64 4, // CK23: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK23: store ptr [[ARR]], ptr [[BP0]], @@ -1411,11 +1411,11 @@ void foo(int arg) { void foo(int arg) { double darr[3][4][5]; // CK24: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK24: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x double]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_DECAY]], {{.+}} + // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x double], ptr [[ARRAY_DECAY]], {{.+}} // CK24: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds double, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw double, ptr [[ARRAY_DECAY_2]], {{.+}} // CK24: [[MUL:%.+]] = mul nuw i64 8, // CK24: [[SUB:%.+]] = sub nuw i64 4, [[ARG:%.+]] // CK24: [[LEN:%.+]] = udiv {{.+}} [[SUB]], 1 @@ -1488,15 +1488,15 @@ void foo(int arg) { // CK25: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], // CK25: [[DIMS_2:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x i32]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} + // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} // CK25: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 + // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds nuw {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 // CK25: [[LEN:%.+]] = sub nuw 
i64 4, [[ARG_ADDR:%.+]] - // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds nuw [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY_5:%.+]] = getelementptr inbounds [3 x float], ptr [[ARRAY_IDX_4]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 + // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 // CK25: [[BP0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[BP:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[P0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 diff --git a/clang/test/OpenMP/task_codegen.c b/clang/test/OpenMP/task_codegen.c index 0d10cbce4aa80..d08eb3762d5c9 100644 --- a/clang/test/OpenMP/task_codegen.c +++ b/clang/test/OpenMP/task_codegen.c @@ -183,7 +183,7 @@ for (int i = 0; i < 10; ++i) // CHECK: [[A:%.+]] = load ptr, ptr [[A_ADDR:%.+]], // CHECK: [[K:%.+]] = load i32, ptr [[K_ADDR]], // CHECK: [[IDX:%.+]] = zext i32 [[K]] to i64 - // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IDX]] + // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds nuw ptr, ptr [[A]], i64 [[IDX]] // CHECK: [[AK:%.+]] = load ptr, ptr [[AK_ADDR]], // CHECK: [[I:%.+]] = load i32, ptr [[I_ADDR]], // CHECK: [[IDX:%.+]] = sext i32 [[I]] to i64 diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp index b256c41132ed3..c3e6d9e6b1cf7 100644 --- a/clang/test/OpenMP/task_codegen.cpp +++ b/clang/test/OpenMP/task_codegen.cpp @@ -309,9 +309,9 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -346,13 +346,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// 
CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -384,13 +384,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -427,8 +427,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -436,8 +436,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-NEXT: [[TMP120:%.*]] = ptrtoint 
ptr [[TMP118]] to i64 @@ -1432,9 +1432,9 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -1469,13 +1469,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -1507,13 +1507,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -1550,8 +1550,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -1559,8 +1559,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -1595,8 +1595,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK1-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK1-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -1604,8 +1604,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK1-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr 
[[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK1-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK1-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK1-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -3040,9 +3040,9 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -3077,13 +3077,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -3115,13 +3115,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-NEXT: 
[[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -3158,8 +3158,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -3167,8 +3167,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4163,9 +4163,9 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -4200,13 +4200,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// 
CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -4238,13 +4238,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -4281,8 +4281,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -4290,8 +4290,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 
[[TMP117]] -// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4326,8 +4326,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK2-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK2-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -4335,8 +4335,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK2-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK2-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK2-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK2-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -5773,9 +5773,9 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -5814,13 +5814,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-NEXT: 
[[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -5854,13 +5854,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -5899,8 +5899,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -5908,8 +5908,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-NEXT: 
[[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -6789,9 +6789,9 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -6830,13 +6830,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -6870,13 +6870,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-NEXT: [[TMP85:%.*]] = load i8, ptr 
[[B]], align 1 // CHECK4-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -6915,8 +6915,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -6924,8 +6924,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7808,9 +7808,9 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -7849,13 +7849,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-51-NEXT: 
[[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -7889,13 +7889,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -7934,8 +7934,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 
1 @@ -7943,8 +7943,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7981,8 +7981,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP138:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP135]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 8, ptr [[TMP138]], align 8 // CHECK3-51-NEXT: [[TMP139:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP139]] -// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX43]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP139]] +// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX43]], i64 3 // CHECK3-51-NEXT: [[TMP140:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP141:%.*]] = sext i32 [[TMP140]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_145:%.*]] = sub nsw i64 [[TMP141]], 1 @@ -7990,8 +7990,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP143:%.*]] = sext i32 [[TMP142]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN46:%.*]] = add nsw i64 -1, [[TMP143]] // CHECK3-51-NEXT: [[TMP144:%.*]] = mul nsw i64 [[LB_ADD_LEN46]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP144]] -// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] +// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP144]] +// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] // CHECK3-51-NEXT: [[TMP145:%.*]] = getelementptr i32, ptr [[ARRAYIDX48]], i32 1 // CHECK3-51-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[ARRAYIDX44]] to i64 // CHECK3-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[TMP145]] to i64 @@ -9323,9 +9323,9 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-51-NEXT: 
[[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -9364,13 +9364,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-51-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -9404,13 +9404,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -9449,8 +9449,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -9458,8 +9458,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp index aa2a478137990..29dc12978d7d9 100644 --- a/clang/test/OpenMP/task_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp @@ -90,7 +90,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -105,7 +105,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: 
[[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -138,7 +138,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -153,7 +153,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp index faf86479dfdae..a8577c7a13579 100644 --- a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[A]], ptr [[A_REF:[^,]+]], // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 1 -// CHECK-DAG: [[GEPA]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPA]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP6:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP6]], // CHECK-DAG: [[TMP7:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 3 @@ -82,7 +82,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr 
[[GEPB:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[B]], ptr [[TMP12:%[^,]+]], // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 1 -// CHECK-DAG: [[GEPB]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPB]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP14:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP14]], // CHECK-DAG: [[TMP15:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 3 @@ -97,7 +97,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARGC_ADDR]], ptr [[TMP20:%[^,]+]], // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 1 -// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP22:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP22]], // CHECK-DAG: [[TMP23:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 3 @@ -116,7 +116,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[C]], ptr [[TMP30:%[^,]+]], // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 1 -// CHECK-DAG: [[GEPC]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPC]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP32:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 2 // CHECK-DAG: store i64 20, ptr [[TMP32]], // CHECK-DAG: [[TMP33:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 3 @@ -131,7 +131,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[VLA]], ptr [[TMP38:%[^,]+]], // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 1 -// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP40:%.+]] = mul nuw i64 [[VLA_SIZE]], 2 // CHECK-DAG: [[TMP41:%.+]] = udiv exact i64 [[TMP40]], ptrtoint (ptr getelementptr (i16, ptr null, i32 1) to i64) // CHECK-DAG: [[TMP42:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 2 diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp index ae0d007756140..87b4cd2caf18a 100644 --- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], 
i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x 
%struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp index 3cdc88ba20b77..6eca033eca551 100644 --- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp @@ -83,9 +83,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -137,10 +137,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp index 6da28d2d973c9..9e4e51a442742 100644 --- 
a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr 
[[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp index d6e40831484aa..83ae053cfd9bd 100644 --- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// 
CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp index be499e0b36548..7987c2de7dd8f 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -100,16 +100,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -154,19 +154,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 
false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -444,16 +444,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -483,7 +483,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr 
[[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -498,19 +498,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -831,9 +831,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 
[[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] From 122874c955e06defb619b1afd4e26db482dbbf19 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 16:17:18 +0100 Subject: [PATCH 239/425] [X86] Fold scalar_to_vector(shift(x,imm)) -> vshift(scalar_to_vector(x),imm) Noticed while working on #107289 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++- llvm/test/CodeGen/X86/buildvec-insertvec.ll | 4 +- .../test/CodeGen/X86/known-signbits-vector.ll | 7 +- .../test/CodeGen/X86/load-scalar-as-vector.ll | 18 ++-- llvm/test/CodeGen/X86/pr44915.ll | 15 ++-- llvm/test/CodeGen/X86/vec_insert-5.ll | 7 +- llvm/test/CodeGen/X86/vec_shift5.ll | 16 ++-- llvm/test/CodeGen/X86/vector-sext.ll | 83 +++++++++---------- .../CodeGen/X86/vector-shuffle-combining.ll | 36 ++++---- 9 files changed, 109 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5f87ffd2f1eab..a4ad4a1bb1201 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57403,7 +57403,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { +static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -57482,6 +57483,25 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { // coverage. } + // Check for cases where we've ended up with a scalarized shift, typically + // during type legalization. 
+ switch (Src.getOpcode()) { + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { + if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) && + Src.hasOneUse()) { + SDValue SrcVec = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0)); + unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false); + return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec, + Amt->getZExtValue(), DAG); + } + } + break; + } + return SDValue(); } @@ -58034,7 +58054,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG); + return combineScalarToVector(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index ae70b6a5a4665..4b0e5441b4abf 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -726,9 +726,9 @@ define void @PR46461(i16 %x, ptr %y) { ; SSE-LABEL: PR46461: ; SSE: # %bb.0: ; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: shrl %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrld $1, %xmm0 ; SSE-NEXT: movdqa %xmm0, 48(%rsi) ; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) @@ -738,9 +738,9 @@ define void @PR46461(i16 %x, ptr %y) { ; AVX1-LABEL: PR46461: ; AVX1: # %bb.0: ; AVX1-NEXT: movzwl %di, %eax -; AVX1-NEXT: shrl %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-NEXT: vmovaps %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 9648daf7427b1..45b61155fe626 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -220,11 +220,10 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $30, %ecx -; X86-NEXT: shll $2, %eax ; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X86-NEXT: sarl $30, %eax +; X86-NEXT: vpslld $2, %xmm0, %xmm0 +; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; X86-NEXT: vpsrlq $3, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll index 3edbcd1fe18eb..d2359ced3e19d 100644 --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -274,16 +274,14 @@ define <2 x i64> @lshr_op0_constant(ptr %p) nounwind { define <4 x i32> @lshr_op1_constant(ptr %p) nounwind { ; SSE-LABEL: lshr_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: shrl $17, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: psrld $17, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: lshr_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: shrl $17, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i32, ptr %p %b = lshr i32 %x,
17 @@ -317,15 +315,15 @@ define <8 x i16> @ashr_op1_constant(ptr %p) nounwind { ; SSE-LABEL: ashr_op1_constant: ; SSE: # %bb.0: ; SSE-NEXT: movswl (%rdi), %eax -; SSE-NEXT: sarl $7, %eax ; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: psrad $7, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ashr_op1_constant: ; AVX: # %bb.0: ; AVX-NEXT: movswl (%rdi), %eax -; AVX-NEXT: sarl $7, %eax ; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpsrad $7, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i16, ptr %p %b = ashr i16 %x, 7 @@ -474,8 +472,8 @@ define <2 x i64> @udiv_op1_constant(ptr %p) nounwind { ; SSE-NEXT: shrq %rax ; SSE-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D ; SSE-NEXT: mulq %rcx -; SSE-NEXT: shrq $4, %rdx ; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: psrlq $4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: udiv_op1_constant: @@ -484,8 +482,8 @@ define <2 x i64> @udiv_op1_constant(ptr %p) nounwind { ; AVX-NEXT: shrq %rax ; AVX-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D ; AVX-NEXT: mulq %rcx -; AVX-NEXT: shrq $4, %rdx ; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vpsrlq $4, %xmm0, %xmm0 ; AVX-NEXT: retq %x = load i64, ptr %p %b = udiv i64 %x, 42 diff --git a/llvm/test/CodeGen/X86/pr44915.ll b/llvm/test/CodeGen/X86/pr44915.ll index 1ebdd9ccb3190..99205ab60ae11 100644 --- a/llvm/test/CodeGen/X86/pr44915.ll +++ b/llvm/test/CodeGen/X86/pr44915.ll @@ -52,15 +52,14 @@ define i32 @extract3(ptr, i32) nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $7, %ecx ; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $7, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $7, %eax +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: shrl $12, %eax -; X64-NEXT: movd %eax, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psrld $12, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: movdqa %xmm0, -24(%rsp) ; X64-NEXT: andl $7, %esi ; X64-NEXT: movzwl -24(%rsp,%rsi,2), %eax diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index 176ae81e08a76..91743898545ee 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -9,17 +9,16 @@ define void @t1(i32 %a, ptr %P) nounwind { ; X86-LABEL: t1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $12, %ecx -; X86-NEXT: movd %ecx, %xmm0 +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pslld $12, %xmm0 ; X86-NEXT: psllq $32, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pslld $12, %xmm0 ; X64-NEXT: psllq $32, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll index f8bc6b01c70a8..2ab00ea96ada1 100644 --- a/llvm/test/CodeGen/X86/vec_shift5.ll +++ b/llvm/test/CodeGen/X86/vec_shift5.ll @@ -290,13 +290,15 @@ define <4 x i32> @extelt0_twice_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x ; This would crash because the scalar shift amount has a different type than the shift result. 
define <2 x i8> @PR58661(<2 x i8> %a0) { -; CHECK-LABEL: PR58661: -; CHECK: # %bb.0: -; CHECK-NEXT: psrlw $8, %xmm0 -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: PR58661: +; X86: # %bb.0: +; X86-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: PR58661: +; X64: # %bb.0: +; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq %shuffle = shufflevector <2 x i8> %a0, <2 x i8> , <2 x i32> %x = bitcast <2 x i8> %shuffle to i16 %shl = shl nuw i16 %x, 8 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index dc9e69137a8a7..d44b11f4ca1da 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -3601,15 +3601,14 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE2-LABEL: sext_4i17_to_4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shll $15, %ecx -; SSE2-NEXT: sarl $15, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslld $15, %xmm0 +; SSE2-NEXT: psrad $15, %xmm0 ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq $17, %rcx -; SSE2-NEXT: shll $15, %ecx -; SSE2-NEXT: sarl $15, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pslld $15, %xmm1 +; SSE2-NEXT: psrad $15, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx ; SSE2-NEXT: shll $28, %ecx @@ -3617,12 +3616,12 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE2-NEXT: shrq $51, %rdx ; SSE2-NEXT: shll $15, %edx ; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: sarl $15, %edx ; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: psrad $15, %xmm1 ; SSE2-NEXT: shrq $34, %rax -; SSE2-NEXT: shll $15, %eax -; SSE2-NEXT: sarl $15, %eax ; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pslld $15, %xmm2 +; SSE2-NEXT: psrad $15, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq @@ -3630,15 +3629,14 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSSE3-LABEL: sext_4i17_to_4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq (%rdi), %rax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shll $15, %ecx -; SSSE3-NEXT: sarl $15, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pslld $15, %xmm0 +; SSSE3-NEXT: psrad $15, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx ; SSSE3-NEXT: shrq $17, %rcx -; SSSE3-NEXT: shll $15, %ecx -; SSSE3-NEXT: sarl $15, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pslld $15, %xmm1 +; SSSE3-NEXT: psrad $15, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx ; SSSE3-NEXT: shll $28, %ecx @@ -3646,12 +3644,12 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSSE3-NEXT: shrq $51, %rdx ; SSSE3-NEXT: shll $15, %edx ; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: sarl $15, %edx ; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: psrad $15, %xmm1 ; SSSE3-NEXT: shrq $34, %rax -; SSSE3-NEXT: shll $15, %eax -; SSSE3-NEXT: sarl $15, %eax ; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: pslld $15, %xmm2 +; SSSE3-NEXT: psrad $15, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq @@ -3663,10 +3661,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; SSE41-NEXT: shrq $17, %rcx ; SSE41-NEXT: shll $15, %ecx ; SSE41-NEXT: sarl $15, %ecx -; 
SSE41-NEXT: movl %eax, %edx -; SSE41-NEXT: shll $15, %edx -; SSE41-NEXT: sarl $15, %edx -; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pslld $15, %xmm0 +; SSE41-NEXT: psrad $15, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq $34, %rcx @@ -3689,10 +3686,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; AVX-NEXT: shrq $17, %rcx ; AVX-NEXT: shll $15, %ecx ; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: shll $15, %edx -; AVX-NEXT: sarl $15, %edx -; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslld $15, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $15, %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $34, %rcx @@ -3711,25 +3707,24 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; X86-SSE2-LABEL: sext_4i17_to_4i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %ecx -; X86-SSE2-NEXT: movl 4(%edx), %eax +; X86-SSE2-NEXT: movl (%edx), %eax +; X86-SSE2-NEXT: movl 4(%edx), %ecx ; X86-SSE2-NEXT: movl 8(%edx), %edx -; X86-SSE2-NEXT: shldl $13, %eax, %edx -; X86-SSE2-NEXT: shll $15, %edx -; X86-SSE2-NEXT: sarl $15, %edx +; X86-SSE2-NEXT: shldl $13, %ecx, %edx ; X86-SSE2-NEXT: movd %edx, %xmm0 -; X86-SSE2-NEXT: movl %eax, %edx -; X86-SSE2-NEXT: shll $13, %edx -; X86-SSE2-NEXT: sarl $15, %edx -; X86-SSE2-NEXT: movd %edx, %xmm1 +; X86-SSE2-NEXT: pslld $15, %xmm0 +; X86-SSE2-NEXT: psrad $15, %xmm0 +; X86-SSE2-NEXT: movd %ecx, %xmm1 +; X86-SSE2-NEXT: pslld $13, %xmm1 +; X86-SSE2-NEXT: psrad $15, %xmm1 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: shldl $15, %ecx, %eax -; X86-SSE2-NEXT: shll $15, %ecx -; X86-SSE2-NEXT: sarl $15, %ecx -; X86-SSE2-NEXT: movd %ecx, %xmm0 -; X86-SSE2-NEXT: shll $15, %eax -; X86-SSE2-NEXT: sarl $15, %eax -; X86-SSE2-NEXT: movd %eax, %xmm2 +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: pslld $15, %xmm0 +; X86-SSE2-NEXT: psrad $15, %xmm0 +; X86-SSE2-NEXT: shldl $15, %eax, %ecx +; X86-SSE2-NEXT: movd %ecx, %xmm2 +; X86-SSE2-NEXT: pslld $15, %xmm2 +; X86-SSE2-NEXT: psrad $15, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-SSE2-NEXT: retl @@ -3748,9 +3743,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; X86-SSE41-NEXT: shldl $15, %eax, %ecx ; X86-SSE41-NEXT: shll $15, %ecx ; X86-SSE41-NEXT: sarl $15, %ecx -; X86-SSE41-NEXT: shll $15, %eax -; X86-SSE41-NEXT: sarl $15, %eax ; X86-SSE41-NEXT: movd %eax, %xmm0 +; X86-SSE41-NEXT: pslld $15, %xmm0 +; X86-SSE41-NEXT: psrad $15, %xmm0 ; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 ; X86-SSE41-NEXT: shll $13, %esi ; X86-SSE41-NEXT: sarl $15, %esi diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 923af983f1d47..04262b4249256 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3537,47 +3537,43 @@ define <16 x i8> @PR107289(<16 x i8> %0) { ; SSE2-LABEL: PR107289: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rcx ; SSE2-NEXT: shldq $8, %rax, %rcx -; SSE2-NEXT: shlq $8, %rax ; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: psllq $8, %xmm0 ; SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR107289: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: shldq $8, %rax, %rcx -; SSSE3-NEXT: shlq $8, %rax ; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: psllq $8, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR107289: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: shldq $8, %rcx, %rax -; SSE41-NEXT: shlq $8, %rcx -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: shldq $8, %rax, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: psllq $8, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: PR107289: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: shldq $8, %rcx, %rax -; AVX-NEXT: shlq $8, %rcx -; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: shldq $8, %rax, %rcx ; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %src = bitcast <16 x i8> %0 to i128 %shl = shl i128 %src, 8 From 0c8d6df362fe5b4bce54776e2199623d0382293b Mon Sep 17 00:00:00 2001 From: erichkeane Date: Thu, 5 Sep 2024 08:18:41 -0700 Subject: [PATCH 240/425] Fix handling of FP-classify where the last arg fails to convert The last argument of an FP-classify function was checked for validity as an expression, but we never ensured that the usual unary conversions/etc properly resulted in a valid value. Thus, when we got the value, it was null, so we had a null dereference. This patch instead fails out/marks the function call as invalid if the argument is incorrect. I DID consider just allowing it to continue, but the result was an extraneous error about how the last argument wasn't a float (in this case, it was an overload set). Fixes: #107411 --- clang/lib/Sema/SemaChecking.cpp | 17 +++++++++++++---- clang/test/Sema/builtin-unary-fp.c | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 2aab52160afa7..99500daca295c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4936,10 +4936,19 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, // Usual Unary Conversions will convert half to float, which we want for // machines that use fp16 conversion intrinsics. Else, we wnat to leave the // type how it is, but do normal L->Rvalue conversions.
- if (Context.getTargetInfo().useFP16ConversionIntrinsics()) - OrigArg = UsualUnaryConversions(OrigArg).get(); - else - OrigArg = DefaultFunctionArrayLvalueConversion(OrigArg).get(); + if (Context.getTargetInfo().useFP16ConversionIntrinsics()) { + ExprResult Res = UsualUnaryConversions(OrigArg); + + if (!Res.isUsable()) + return true; + OrigArg = Res.get(); + } else { + ExprResult Res = DefaultFunctionArrayLvalueConversion(OrigArg); + + if (!Res.isUsable()) + return true; + OrigArg = Res.get(); + } TheCall->setArg(FPArgNo, OrigArg); QualType VectorResultTy; diff --git a/clang/test/Sema/builtin-unary-fp.c b/clang/test/Sema/builtin-unary-fp.c index 3f4f65eeb73a1..fb8e341156a59 100644 --- a/clang/test/Sema/builtin-unary-fp.c +++ b/clang/test/Sema/builtin-unary-fp.c @@ -14,4 +14,8 @@ void a(void) { check(__builtin_fpclassify(0, 1, 2, 3, 4.5, 5.0)); // expected-warning{{implicit conversion from 'double' to 'int' changes value from 4.5 to 4}} check(__builtin_fpclassify(0, 0, 0, 0, 1)); // expected-error{{too few arguments}} check(__builtin_fpclassify(0, 0, 0, 0, 0, 1, 0)); // expected-error{{too many arguments}} + + check(__builtin_fpclassify(0,0,0,0,0, (invalid))); // expected-error{{use of undeclared identifier 'invalid'}} + check(__builtin_fpclassify(0,0,0,0,0, (inf))); // expected-error{{use of undeclared identifier 'inf'}} + // expected-error@-1{{reference to overloaded function could not be resolved}} } From aea3b0f6838bd8268fc3653e1b662d771c87ab15 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:35:18 -0700 Subject: [PATCH 241/425] [ARM] Avoid repeated hash lookups (NFC) (#107356) --- llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index b9ff3a08f998f..54bf5fffd3942 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -254,13 +254,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { return -1U; } - DenseMap::iterator getCoalescedWeight( - MachineBasicBlock* MBB) { - auto It = CoalescedWeights.find(MBB); - if (It == CoalescedWeights.end()) { - It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first; - } - return It; + DenseMap::iterator + getCoalescedWeight(MachineBasicBlock *MBB) { + return CoalescedWeights.try_emplace(MBB, 0).first; } /// Indicate to the backend that \c GV has had its storage changed to inside From abfb340b779f2b20009fe42ebc522417adf79c44 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:35:46 -0700 Subject: [PATCH 242/425] [Analysis] Avoid repeated hash lookups (NFC) (#107357) --- clang/lib/Analysis/ThreadSafety.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index c4a83b069e079..5577f45aa5217 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -1180,8 +1180,7 @@ void BeforeSet::checkBeforeAfter(const ValueDecl* StartVd, } // Transitively search other before sets, and warn on cycles. 
if (traverse(Vdb)) { - if (!CycMap.contains(Vd)) { - CycMap.insert(std::make_pair(Vd, true)); + if (CycMap.try_emplace(Vd, true).second) { StringRef L1 = Vd->getName(); Analyzer.Handler.handleBeforeAfterCycle(L1, Vd->getLocation()); } From 0593b95ff4c459ccf71f9472c148967d40f6d865 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 08:36:09 -0700 Subject: [PATCH 243/425] [CGOpenMPRuntime] Avoid repeated hash lookups (NFC) (#107358) --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3d392d869ee39..9cf597a65be04 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11674,9 +11674,7 @@ CGOpenMPRuntime::LastprivateConditionalRAII::~LastprivateConditionalRAII() { Address CGOpenMPRuntime::emitLastprivateConditionalInit(CodeGenFunction &CGF, const VarDecl *VD) { ASTContext &C = CGM.getContext(); - auto I = LastprivateConditionalToTypes.find(CGF.CurFn); - if (I == LastprivateConditionalToTypes.end()) - I = LastprivateConditionalToTypes.try_emplace(CGF.CurFn).first; + auto I = LastprivateConditionalToTypes.try_emplace(CGF.CurFn).first; QualType NewType; const FieldDecl *VDField; const FieldDecl *FiredField; From be427dfb9ea6689947253d737708dc3645e179dc Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 5 Sep 2024 16:38:08 +0100 Subject: [PATCH 244/425] [Clang][Parser] Accept P2741R3 (static_assert with user-generated message) in C++11 as an extension (#102044) Added a new `-Wpre-c++26-compat` warning for when this feature is used in C++26 and a `-Wc++26-extensions` warning for when this is used in C++11 through C++23. --------- Co-authored-by: cor3ntin --- clang/docs/LanguageExtensions.rst | 2 ++ clang/docs/ReleaseNotes.rst | 3 ++ .../clang/Basic/DiagnosticParseKinds.td | 6 ++++ clang/lib/Frontend/InitPreprocessor.cpp | 7 ++--- clang/lib/Parse/ParseDeclCXX.cpp | 10 +++++-- clang/test/CXX/drs/cwg27xx.cpp | 2 +- clang/test/Lexer/cxx-features.cpp | 2 +- .../Parser/cxx11-user-defined-literals.cpp | 3 +- clang/test/Sema/static-assert.c | 8 ++++-- clang/test/SemaCXX/static-assert-ext.cpp | 28 +++++++++++++++++++ 10 files changed, 59 insertions(+), 12 deletions(-) create mode 100644 clang/test/SemaCXX/static-assert-ext.cpp diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 62903fc3744ca..c08697282cbfe 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1483,6 +1483,7 @@ Generic lambda expressions __cpp_generic_lambdas C+ variable templates __cpp_variable_templates C++14 C++03 Binary literals __cpp_binary_literals C++14 C++03 Relaxed constexpr __cpp_constexpr C++14 C++11 +Static assert with no message __cpp_static_assert >= 201411L C++17 C++11 Pack expansion in generalized lambda-capture __cpp_init_captures C++17 C++03 ``if constexpr`` __cpp_if_constexpr C++17 C++11 fold expressions __cpp_fold_expressions C++17 C++03 @@ -1503,6 +1504,7 @@ Conditional ``explicit`` __cpp_conditional_explicit C+ ``static operator()`` __cpp_static_call_operator C++23 C++03 Attributes on Lambda-Expressions C++23 C++11 Attributes on Structured Bindings __cpp_structured_bindings C++26 C++03 +Static assert with user-generated message __cpp_static_assert >= 202306L C++26 C++11 Pack Indexing __cpp_pack_indexing C++26 C++03 ``= delete ("should have a reason");`` __cpp_deleted_function C++26 C++03 Variadic Friends __cpp_variadic_friend C++26 C++03 diff --git 
a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ab3c3e6049f60..ebd0b7371e1be 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -109,6 +109,9 @@ C++ Language Changes constant expression. Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. +- Accept C++26 user-defined ``static_assert`` messages in C++11 as an extension. + + C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 0b8ab4bf09250..0aa2c4a70849a 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -470,6 +470,12 @@ def warn_c17_compat_static_assert_no_message : Warning< "'_Static_assert' with no message is incompatible with C standards before " "C23">, DefaultIgnore, InGroup; +def ext_cxx_static_assert_user_generated_message : ExtWarn< + "'static_assert' with a user-generated message is a C++26 extension">, + InGroup; +def warn_cxx20_compat_static_assert_user_generated_message : Warning< + "'static_assert' with a user-generated message is incompatible with " + "C++ standards before C++26">, DefaultIgnore, InGroup; def err_function_definition_not_allowed : Error< "function definition is not allowed here">; def err_expected_end_of_enumerator : Error< diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 61260a3379828..9a0fdb175ff29 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -671,10 +671,9 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, LangOpts.CPlusPlus23 ? "202211L" : LangOpts.CPlusPlus17 ? "201603L" : "200907"); - Builder.defineMacro("__cpp_static_assert", LangOpts.CPlusPlus26 ? "202306L" - : LangOpts.CPlusPlus17 - ? "201411L" - : "200410"); + // C++17 / C++26 static_assert supported as an extension in earlier language + // modes, so we use the C++26 value. + Builder.defineMacro("__cpp_static_assert", "202306L"); Builder.defineMacro("__cpp_decltype", "200707L"); Builder.defineMacro("__cpp_attributes", "200809L"); Builder.defineMacro("__cpp_rvalue_references", "200610L"); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 7ca27d00c0bcb..6370da1fab004 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -1076,7 +1076,7 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { } bool ParseAsExpression = false; - if (getLangOpts().CPlusPlus26) { + if (getLangOpts().CPlusPlus11) { for (unsigned I = 0;; ++I) { const Token &T = GetLookAheadToken(I); if (T.is(tok::r_paren)) @@ -1088,9 +1088,13 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { } } - if (ParseAsExpression) + if (ParseAsExpression) { + Diag(Tok, + getLangOpts().CPlusPlus26 + ? 
diag::warn_cxx20_compat_static_assert_user_generated_message + : diag::ext_cxx_static_assert_user_generated_message); AssertMessage = ParseConstantExpressionInExprEvalContext(); - else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) + } else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) AssertMessage = ParseUnevaluatedStringLiteralExpression(); else { Diag(Tok, diag::err_expected_string_literal) diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index b3867696c615b..2b57dbc60aed7 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -178,7 +178,7 @@ void test() { } namespace cwg2798 { // cwg2798: 17 -#if __cpp_static_assert >= 202306 +#if __cplusplus > 202302L struct string { constexpr string() { data_ = new char[6](); diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp index 4a06d29ae9dbc..5b88e00b71508 100644 --- a/clang/test/Lexer/cxx-features.cpp +++ b/clang/test/Lexer/cxx-features.cpp @@ -325,7 +325,7 @@ #error "wrong value for __cpp_range_based_for" #endif -#if check(static_assert, 0, 200410, 200410, 201411, 201411, 201411, 202306) +#if check(static_assert, 0, 202306, 202306, 202306, 202306, 202306, 202306) #error "wrong value for __cpp_static_assert" #endif diff --git a/clang/test/Parser/cxx11-user-defined-literals.cpp b/clang/test/Parser/cxx11-user-defined-literals.cpp index 1a7e780588229..cdd06729efc39 100644 --- a/clang/test/Parser/cxx11-user-defined-literals.cpp +++ b/clang/test/Parser/cxx11-user-defined-literals.cpp @@ -21,7 +21,8 @@ int f() { asm("mov %eax, %rdx"_foo); // expected-error {{user-defined suffix cannot be used here}} } -static_assert(true, "foo"_bar); // expected-error {{user-defined suffix cannot be used here}} +static_assert(true, "foo"_bar); // expected-error {{no matching literal operator for call to 'operator""_bar'}} +// expected-warning@-1 {{'static_assert' with a user-generated message is a C++26 extension}} int cake() __attribute__((availability(macosx, unavailable, message = "is a lie"_x))); // expected-error {{user-defined suffix cannot be used here}} diff --git a/clang/test/Sema/static-assert.c b/clang/test/Sema/static-assert.c index 4e9e6b7ee558b..d603bc19bb824 100644 --- a/clang/test/Sema/static-assert.c +++ b/clang/test/Sema/static-assert.c @@ -25,8 +25,12 @@ void foo(void) { #endif } -_Static_assert(1, invalid); // expected-error {{expected string literal for diagnostic message in static_assert}} \ - // ext-warning {{'_Static_assert' is a C11 extension}} +_Static_assert(1, invalid); // ext-warning {{'_Static_assert' is a C11 extension}} +#ifndef __cplusplus +// expected-error@-2 {{expected string literal for diagnostic message in static_assert}} +#endif +// cxx-error@-4 {{use of undeclared identifier 'invalid'}} +// cxx-warning@-5 {{'static_assert' with a user-generated message is a C++26 extension}} struct A { int a; diff --git a/clang/test/SemaCXX/static-assert-ext.cpp b/clang/test/SemaCXX/static-assert-ext.cpp new file mode 100644 index 0000000000000..05f7a0e96974a --- /dev/null +++ b/clang/test/SemaCXX/static-assert-ext.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -std=c++98 -fsyntax-only -pedantic %s -verify=precxx11,precxx17,precxx26 +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -pedantic %s -verify=since-cxx11,precxx17,precxx26 -Wc++98-compat +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -pedantic %s -verify=since-cxx11,since-cxx17,precxx26 -Wc++98-compat -Wpre-c++17-compat +// RUN: %clang_cc1 -std=c++26 -fsyntax-only -pedantic %s 
-verify=since-cxx11,since-cxx17,since-cxx26 -Wc++98-compat -Wpre-c++17-compat -Wpre-c++26-compat + +static_assert(false, "a"); +// precxx11-error@-1 {{a type specifier is required for all declarations}} +// since-cxx11-warning@-2 {{'static_assert' declarations are incompatible with C++98}} +// since-cxx11-error@-3 {{static assertion failed: a}} + +#if __cplusplus >= 201103L +static_assert(false); +// since-cxx11-warning@-1 {{'static_assert' declarations are incompatible with C++98}} +// precxx17-warning@-2 {{'static_assert' with no message is a C++17 extension}} +// since-cxx17-warning@-3 {{'static_assert' with no message is incompatible with C++ standards before C++17}} +// since-cxx11-error@-4 {{static assertion failed}} + +struct X { + static constexpr int size() { return 1; } // since-cxx11-warning {{'constexpr'}} + static constexpr const char* data() { return "b"; } // since-cxx11-warning {{'constexpr'}} +}; + +static_assert(false, X()); +// since-cxx11-warning@-1 {{'static_assert' declarations are incompatible with C++98}} +// precxx26-warning@-2 {{'static_assert' with a user-generated message is a C++26 extension}} +// since-cxx26-warning@-3 {{'static_assert' with a user-generated message is incompatible with C++ standards before C++26}} +// since-cxx11-error@-4 {{static assertion failed: b}} +#endif From 13013bdc6a5e4def05204fb69d7a31ef17ccd1c7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 5 Sep 2024 08:42:13 -0700 Subject: [PATCH 245/425] [RISCV] Don't cost Fmv for Zfinx in isFPImmLegal. (#107361) There is no Fmv with Zfinx. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +- llvm/test/CodeGen/RISCV/double-convert.ll | 36 +-- llvm/test/CodeGen/RISCV/double-imm.ll | 4 +- llvm/test/CodeGen/RISCV/double-intrinsics.ll | 24 +- llvm/test/CodeGen/RISCV/double-round-conv.ll | 20 +- llvm/test/CodeGen/RISCV/float-convert.ll | 32 +- .../CodeGen/RISCV/float-round-conv-sat.ll | 48 +-- llvm/test/CodeGen/RISCV/half-arith.ll | 41 +-- llvm/test/CodeGen/RISCV/half-convert.ll | 232 +++++++------- llvm/test/CodeGen/RISCV/half-imm.ll | 8 +- llvm/test/CodeGen/RISCV/half-intrinsics.ll | 24 +- .../test/CodeGen/RISCV/half-round-conv-sat.ll | 264 +++++++-------- llvm/test/CodeGen/RISCV/half-round-conv.ll | 300 +++++++++--------- 13 files changed, 512 insertions(+), 528 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d400b2ea1ca2c..6b4219b462384 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2288,10 +2288,11 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return true; // Building an integer and then converting requires a fmv at the end of - // the integer sequence. + // the integer sequence. The fmv is not required for Zfinx. + const int FmvCost = Subtarget.hasStdExtZfinx() ? 
0 : 1; const int Cost = - 1 + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), Subtarget.getXLen(), - Subtarget); + FmvCost + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), + Subtarget.getXLen(), Subtarget); return Cost <= FPImmCost; } diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index 2e2e1b924cf00..ef2d8e7627be5 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -1668,16 +1668,16 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI26_0)(a1) -; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_1) -; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_1)(a2) -; RV64IZFINXZDINX-NEXT: feq.d a3, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a3, a3 -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) +; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_0)(a2) +; RV64IZFINXZDINX-NEXT: li a3, -505 +; RV64IZFINXZDINX-NEXT: slli a3, a3, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a3 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz -; RV64IZFINXZDINX-NEXT: and a0, a3, a0 +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -2043,16 +2043,16 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI30_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI30_0)(a1) -; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI30_1) -; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI30_1)(a2) -; RV64IZFINXZDINX-NEXT: feq.d a3, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a3, a3 -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: li a2, -509 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a2 +; RV64IZFINXZDINX-NEXT: lui a2, 65919 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 34 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz -; RV64IZFINXZDINX-NEXT: and a0, a3, a0 +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i8: @@ -2234,9 +2234,9 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_s_sat_i8: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI32_0)(a1) ; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, zero +; RV64IZFINXZDINX-NEXT: lui a1, 131967 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 33 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a1 ; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz ; RV64IZFINXZDINX-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 74d4acc4f23f8..827f034f143fb 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -62,8 +62,8 @@ define double @double_imm_op(double %a) nounwind { ; ; CHECKRV64ZDINX-LABEL: double_imm_op: ; CHECKRV64ZDINX: # %bb.0: -; CHECKRV64ZDINX-NEXT: lui a1, %hi(.LCPI1_0) -; CHECKRV64ZDINX-NEXT: ld a1, %lo(.LCPI1_0)(a1) +; CHECKRV64ZDINX-NEXT: li a1, 1023 +; CHECKRV64ZDINX-NEXT: slli a1, a1, 52 ; CHECKRV64ZDINX-NEXT: fadd.d a0, a0, a1 ; 
CHECKRV64ZDINX-NEXT: ret %1 = fadd double %a, 1.0 diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll index eef48d1eafbfe..94b3b1f1b199c 100644 --- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll @@ -869,8 +869,8 @@ define double @floor_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: floor_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI17_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB17_2 @@ -934,8 +934,8 @@ define double @ceil_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: ceil_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI18_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB18_2 @@ -999,8 +999,8 @@ define double @trunc_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: trunc_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI19_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB19_2 @@ -1064,8 +1064,8 @@ define double @rint_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: rint_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI20_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB20_2 @@ -1170,8 +1170,8 @@ define double @round_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: round_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI22_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB22_2 @@ -1235,8 +1235,8 @@ define double @roundeven_f64(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: roundeven_f64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI23_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/double-round-conv.ll b/llvm/test/CodeGen/RISCV/double-round-conv.ll index d84d80a4a10e9..12f025c65f36a 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv.ll @@ -1130,8 +1130,8 @@ define double @test_floor_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_floor_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI40_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI40_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; 
RV64IZFINXZDINX-NEXT: beqz a1, .LBB40_2 @@ -1177,8 +1177,8 @@ define double @test_ceil_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_ceil_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI41_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI41_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB41_2 @@ -1224,8 +1224,8 @@ define double @test_trunc_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_trunc_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI42_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI42_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB42_2 @@ -1271,8 +1271,8 @@ define double @test_round_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_round_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI43_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI43_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB43_2 @@ -1318,8 +1318,8 @@ define double @test_roundeven_double(double %x) { ; ; RV64IZFINXZDINX-LABEL: test_roundeven_double: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI44_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI44_0)(a1) +; RV64IZFINXZDINX-NEXT: li a1, 1075 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 52 ; RV64IZFINXZDINX-NEXT: fabs.d a2, a0 ; RV64IZFINXZDINX-NEXT: flt.d a1, a2, a1 ; RV64IZFINXZDINX-NEXT: beqz a1, .LBB44_2 diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 805ddee4ac3f6..031976b4fa2b2 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -682,8 +682,8 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: # %bb.1: # %start ; RV32IZFINX-NEXT: mv a2, a1 ; RV32IZFINX-NEXT: .LBB12_2: # %start -; RV32IZFINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZFINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) +; RV32IZFINX-NEXT: lui a1, 389120 +; RV32IZFINX-NEXT: addi a1, a1, -1 ; RV32IZFINX-NEXT: flt.s a3, a1, s0 ; RV32IZFINX-NEXT: beqz a3, .LBB12_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -910,9 +910,9 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI14_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI14_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1445,11 +1445,11 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZFINX: # %bb.0: # %start ; RV32IZFINX-NEXT: feq.s a1, a0, a0 -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI24_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI24_0)(a2) ; RV32IZFINX-NEXT: neg a1, a1 -; RV32IZFINX-NEXT: lui a3, 815104 -; RV32IZFINX-NEXT: fmax.s a0, a0, a3 +; RV32IZFINX-NEXT: lui a2, 815104 +; RV32IZFINX-NEXT: fmax.s a0, a0, a2 +; RV32IZFINX-NEXT: lui a2, 290816 +; RV32IZFINX-NEXT: addi a2, a2, -512 ; RV32IZFINX-NEXT: fmin.s a0, a0, a2 ; RV32IZFINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZFINX-NEXT: and 
a0, a1, a0 @@ -1458,11 +1458,11 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINX: # %bb.0: # %start ; RV64IZFINX-NEXT: feq.s a1, a0, a0 -; RV64IZFINX-NEXT: lui a2, %hi(.LCPI24_0) -; RV64IZFINX-NEXT: lw a2, %lo(.LCPI24_0)(a2) ; RV64IZFINX-NEXT: neg a1, a1 -; RV64IZFINX-NEXT: lui a3, 815104 -; RV64IZFINX-NEXT: fmax.s a0, a0, a3 +; RV64IZFINX-NEXT: lui a2, 815104 +; RV64IZFINX-NEXT: fmax.s a0, a0, a2 +; RV64IZFINX-NEXT: lui a2, 290816 +; RV64IZFINX-NEXT: addiw a2, a2, -512 ; RV64IZFINX-NEXT: fmin.s a0, a0, a2 ; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZFINX-NEXT: and a0, a1, a0 @@ -1622,18 +1622,18 @@ define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind { ; ; RV32IZFINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFINX: # %bb.0: # %start -; RV32IZFINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV32IZFINX-NEXT: lw a1, %lo(.LCPI26_0)(a1) ; RV32IZFINX-NEXT: fmax.s a0, a0, zero +; RV32IZFINX-NEXT: lui a1, 292864 +; RV32IZFINX-NEXT: addi a1, a1, -256 ; RV32IZFINX-NEXT: fmin.s a0, a0, a1 ; RV32IZFINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZFINX-NEXT: ret ; ; RV64IZFINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: lui a1, %hi(.LCPI26_0) -; RV64IZFINX-NEXT: lw a1, %lo(.LCPI26_0)(a1) ; RV64IZFINX-NEXT: fmax.s a0, a0, zero +; RV64IZFINX-NEXT: lui a1, 292864 +; RV64IZFINX-NEXT: addiw a1, a1, -256 ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZFINX-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll index 5e99c7eb90562..42ac20286a892 100644 --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -112,9 +112,9 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -241,9 +241,9 @@ define i64 @test_floor_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI3_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -372,9 +372,9 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -501,9 +501,9 @@ define i64 @test_ceil_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI7_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, 
a2, a0 @@ -632,9 +632,9 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -761,9 +761,9 @@ define i64 @test_trunc_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI11_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -892,9 +892,9 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1021,9 +1021,9 @@ define i64 @test_round_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI15_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1152,9 +1152,9 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1281,9 +1281,9 @@ define i64 @test_roundeven_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI19_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1412,9 +1412,9 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s2, s1 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI21_0)(a2) ; RV32IZFINX-NEXT: and a0, s2, a0 +; RV32IZFINX-NEXT: lui a2, 389120 +; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a4, a2, s0 ; RV32IZFINX-NEXT: neg a2, a4 ; RV32IZFINX-NEXT: or a0, a2, a0 @@ -1541,9 +1541,9 @@ define i64 @test_rint_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: neg s1, a0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixunssfdi -; RV32IZFINX-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZFINX-NEXT: lw a2, %lo(.LCPI23_0)(a2) ; RV32IZFINX-NEXT: and a0, s1, a0 +; RV32IZFINX-NEXT: lui a2, 391168 +; 
RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index f8522b09970bf..b033c75eeadd8 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -2910,35 +2910,18 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; RV64IZFHMIN-NEXT: ret ; -; RV32IZHINXMIN-LABEL: fsgnjx_f16: -; RV32IZHINXMIN: # %bb.0: -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) -; RV32IZHINXMIN-NEXT: lui a3, 1048568 -; RV32IZHINXMIN-NEXT: and a0, a0, a3 -; RV32IZHINXMIN-NEXT: slli a2, a2, 17 -; RV32IZHINXMIN-NEXT: srli a2, a2, 17 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 -; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 -; RV32IZHINXMIN-NEXT: ret -; -; RV64IZHINXMIN-LABEL: fsgnjx_f16: -; RV64IZHINXMIN: # %bb.0: -; RV64IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV64IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) -; RV64IZHINXMIN-NEXT: lui a3, 1048568 -; RV64IZHINXMIN-NEXT: and a0, a0, a3 -; RV64IZHINXMIN-NEXT: slli a2, a2, 49 -; RV64IZHINXMIN-NEXT: srli a2, a2, 49 -; RV64IZHINXMIN-NEXT: or a0, a2, a0 -; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 -; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 -; RV64IZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsgnjx_f16: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: lui a2, 1048568 +; CHECKIZHINXMIN-NEXT: and a0, a0, a2 +; CHECKIZHINXMIN-NEXT: li a2, 15 +; CHECKIZHINXMIN-NEXT: slli a2, a2, 10 +; CHECKIZHINXMIN-NEXT: or a0, a0, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %z = call half @llvm.copysign.f16(half 1.0, half %x) %mul = fmul half %z, %y ret half %mul diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index bc1a652061020..32f7dfaee8837 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -255,11 +255,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a3, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZHINX-NEXT: lui a2, 815104 +; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: addi a2, a2, -512 ; RV32IZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZHINX-NEXT: and a0, a1, a0 @@ -269,11 +269,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a3, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZHINX-NEXT: lui a2, 815104 +; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: addiw a2, a2, -512 ; RV64IZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZHINX-NEXT: 
and a0, a1, a0 @@ -283,11 +283,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZDINXZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a3, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZDINXZHINX-NEXT: lui a2, 815104 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: and a0, a1, a0 @@ -297,11 +297,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IZDINXZHINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a3, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZDINXZHINX-NEXT: lui a2, 815104 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: and a0, a1, a0 @@ -505,11 +505,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK32-IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 @@ -519,11 +519,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK64-IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 @@ -533,11 +533,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; 
CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -547,11 +547,11 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -755,40 +755,40 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; RV32IZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZDINXZHINX-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: ret @@ -955,40 +955,40 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; CHECK32-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start -; CHECK64-IZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK64-IZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 
+; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI3_0)(a1) ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: ret @@ -2248,10 +2248,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h s0, a0 -; RV32IZHINX-NEXT: flt.s s1, a1, s0 +; RV32IZHINX-NEXT: lui a0, 389120 +; RV32IZHINX-NEXT: addi a0, a0, -1 +; RV32IZHINX-NEXT: flt.s s1, a0, s0 ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: lui a0, 913408 ; RV32IZHINX-NEXT: fle.s s3, a0, s0 @@ -2301,10 +2301,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h s0, a0 -; RV32IZDINXZHINX-NEXT: flt.s s1, a1, s0 +; RV32IZDINXZHINX-NEXT: lui a0, 389120 +; RV32IZDINXZHINX-NEXT: addi a0, a0, -1 +; RV32IZDINXZHINX-NEXT: flt.s s1, a0, s0 ; RV32IZDINXZHINX-NEXT: neg s2, s1 ; RV32IZDINXZHINX-NEXT: lui a0, 913408 ; RV32IZDINXZHINX-NEXT: fle.s s3, a0, s0 @@ -2651,10 +2651,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI10_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h s0, a0 -; CHECK32-IZHINXMIN-NEXT: flt.s s1, a1, s0 +; CHECK32-IZHINXMIN-NEXT: lui a0, 389120 +; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 +; CHECK32-IZHINXMIN-NEXT: flt.s s1, a0, s0 ; CHECK32-IZHINXMIN-NEXT: neg s2, s1 ; CHECK32-IZHINXMIN-NEXT: lui a0, 913408 ; CHECK32-IZHINXMIN-NEXT: fle.s s3, a0, s0 @@ -2705,10 +2705,10 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI10_0) -; 
CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI10_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h s0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: flt.s s1, a1, s0 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a0, 389120 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 +; CHECK32-IZDINXZHINXMIN-NEXT: flt.s s1, a0, s0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg s2, s1 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a0, 913408 ; CHECK32-IZDINXZHINXMIN-NEXT: fle.s s3, a0, s0 @@ -3000,9 +3000,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 +; RV32IZHINX-NEXT: lui a1, 391168 +; RV32IZHINX-NEXT: addi a1, a1, -1 ; RV32IZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZHINX-NEXT: neg s0, a1 ; RV32IZHINX-NEXT: fle.s a1, zero, a0 @@ -3033,9 +3033,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 +; RV32IZDINXZHINX-NEXT: lui a1, 391168 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -1 ; RV32IZDINXZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZDINXZHINX-NEXT: neg s0, a1 ; RV32IZDINXZHINX-NEXT: fle.s a1, zero, a0 @@ -3245,9 +3245,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI12_0) -; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECK32-IZHINXMIN-NEXT: lui a1, 391168 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -1 ; CHECK32-IZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZHINXMIN-NEXT: neg s0, a1 ; CHECK32-IZHINXMIN-NEXT: fle.s a1, zero, a0 @@ -3279,9 +3279,9 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI12_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 391168 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -1 ; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg s0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a1, zero, a0 @@ -6373,11 +6373,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a3, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZHINX-NEXT: lui a2, 815104 +; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: addi a2, a2, -512 ; RV32IZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZHINX-NEXT: and a0, a1, a0 @@ -6387,11 +6387,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZHINX: # %bb.0: # 
%start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV64IZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a3, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZHINX-NEXT: lui a2, 815104 +; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: addiw a2, a2, -512 ; RV64IZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZHINX-NEXT: and a0, a1, a0 @@ -6401,11 +6401,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV32IZDINXZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a3, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV32IZDINXZHINX-NEXT: lui a2, 815104 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: and a0, a1, a0 @@ -6415,11 +6415,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV64IZDINXZHINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a3, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a3 +; RV64IZDINXZHINX-NEXT: lui a2, 815104 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 ; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: and a0, a1, a0 @@ -6627,11 +6627,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK32-IZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 @@ -6641,11 +6641,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK64-IZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 @@ -6655,11 +6655,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; 
CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -6669,11 +6669,11 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, %hi(.LCPI32_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a2, %lo(.LCPI32_0)(a2) ; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a3, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a3 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 @@ -6876,40 +6876,40 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; RV32IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV32IZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV64IZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV32IZDINXZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; RV64IZDINXZHINX-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero +; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz ; RV64IZDINXZHINX-NEXT: ret @@ -7082,40 +7082,40 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZHINXMIN: # %bb.0: # %start -; CHECK32-IZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) 
-; CHECK32-IZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZHINXMIN: # %bb.0: # %start -; CHECK64-IZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK64-IZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start -; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK64-IZDINXZHINXMIN-NEXT: lw a1, %lo(.LCPI34_0)(a1) ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz ; CHECK64-IZDINXZHINXMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-imm.ll b/llvm/test/CodeGen/RISCV/half-imm.ll index 4c39885176f01..2ebc28c2ebd44 100644 --- a/llvm/test/CodeGen/RISCV/half-imm.ll +++ b/llvm/test/CodeGen/RISCV/half-imm.ll @@ -70,15 +70,15 @@ define half @half_imm_op(half %a) nounwind { ; ; RV32IZHINX-LABEL: half_imm_op: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 15 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fadd.h a0, a0, a1 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: half_imm_op: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 15 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fadd.h a0, a0, a1 ; RV64IZHINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 40363b321848d..3e0f838270aa5 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -2132,8 +2132,8 @@ define half @floor_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: floor_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB17_2 @@ -2223,8 +2223,8 @@ define half @ceil_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: ceil_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; 
CHECKIZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB18_2 @@ -2314,8 +2314,8 @@ define half @trunc_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: trunc_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB19_2 @@ -2405,8 +2405,8 @@ define half @rint_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: rint_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB20_2 @@ -2616,8 +2616,8 @@ define half @round_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: round_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB22_2 @@ -2707,8 +2707,8 @@ define half @roundeven_f16(half %a) nounwind { ; ; CHECKIZHINX-LABEL: roundeven_f16: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll index 04a8a66f44598..0b93c8789fca5 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -28,8 +28,8 @@ define signext i32 @test_floor_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB0_2 @@ -153,8 +153,8 @@ define i64 @test_floor_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_floor_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB1_2 @@ -174,9 +174,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -203,8 +203,8 @@ define i64 @test_floor_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_floor_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli 
a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB1_2 @@ -317,9 +317,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -381,8 +381,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB2_2 @@ -400,8 +400,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB2_2 @@ -555,8 +555,8 @@ define i64 @test_floor_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_floor_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB3_2 @@ -574,9 +574,9 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI3_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI3_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -590,8 +590,8 @@ define i64 @test_floor_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_floor_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB3_2 @@ -689,9 +689,9 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI3_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -740,8 +740,8 @@ define signext i32 @test_ceil_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB4_2 @@ -865,8 +865,8 @@ define i64 @test_ceil_si64(half %x) nounwind { 
; ; RV32IZHINX-LABEL: test_ceil_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB5_2 @@ -886,9 +886,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -915,8 +915,8 @@ define i64 @test_ceil_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_ceil_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB5_2 @@ -1029,9 +1029,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1093,8 +1093,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB6_2 @@ -1112,8 +1112,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB6_2 @@ -1267,8 +1267,8 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_ceil_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB7_2 @@ -1286,9 +1286,9 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI7_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI7_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -1302,8 +1302,8 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_ceil_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; 
RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB7_2 @@ -1401,9 +1401,9 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI7_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1452,8 +1452,8 @@ define signext i32 @test_trunc_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB8_2 @@ -1577,8 +1577,8 @@ define i64 @test_trunc_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_trunc_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1598,9 +1598,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -1627,8 +1627,8 @@ define i64 @test_trunc_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_trunc_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1741,9 +1741,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -1805,8 +1805,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB10_2 @@ -1824,8 +1824,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB10_2 @@ -1979,8 +1979,8 @@ define i64 @test_trunc_ui64(half %x) 
nounwind { ; ; RV32IZHINX-LABEL: test_trunc_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1998,9 +1998,9 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI11_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI11_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2014,8 +2014,8 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_trunc_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB11_2 @@ -2113,9 +2113,9 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI11_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2164,8 +2164,8 @@ define signext i32 @test_round_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB12_2 @@ -2289,8 +2289,8 @@ define i64 @test_round_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_round_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB13_2 @@ -2310,9 +2310,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2339,8 +2339,8 @@ define i64 @test_round_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_round_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB13_2 @@ -2453,9 +2453,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0) -; 
RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2517,8 +2517,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB14_2 @@ -2536,8 +2536,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB14_2 @@ -2691,8 +2691,8 @@ define i64 @test_round_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_round_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB15_2 @@ -2710,9 +2710,9 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI15_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI15_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -2726,8 +2726,8 @@ define i64 @test_round_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_round_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB15_2 @@ -2825,9 +2825,9 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI15_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -2876,8 +2876,8 @@ define signext i32 @test_roundeven_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB16_2 @@ -3001,8 +3001,8 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_roundeven_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, 
a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB17_2 @@ -3022,9 +3022,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3051,8 +3051,8 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_roundeven_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB17_2 @@ -3165,9 +3165,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3229,8 +3229,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB18_2 @@ -3248,8 +3248,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB18_2 @@ -3403,8 +3403,8 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_roundeven_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB19_2 @@ -3422,9 +3422,9 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI19_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI19_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3438,8 +3438,8 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_roundeven_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB19_2 @@ -3537,9 +3537,9 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; 
RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI19_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3588,8 +3588,8 @@ define signext i32 @test_rint_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_rint_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB20_2 @@ -3713,8 +3713,8 @@ define i64 @test_rint_si64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_rint_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB21_2 @@ -3734,9 +3734,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s2, s1 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI21_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI21_1)(a2) ; RV32IZHINX-NEXT: and a0, s2, a0 +; RV32IZHINX-NEXT: lui a2, 389120 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a4, a2, s0 ; RV32IZHINX-NEXT: neg a2, a4 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -3763,8 +3763,8 @@ define i64 @test_rint_si64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_rint_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB21_2 @@ -3877,9 +3877,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s2, s1 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI21_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s2, a0 +; RV32IZHINXMIN-NEXT: lui a2, 389120 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a4 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 @@ -3941,8 +3941,8 @@ define signext i32 @test_rint_ui32(half %x) { ; ; RV32IZHINX-LABEL: test_rint_ui32: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB22_2 @@ -3960,8 +3960,8 @@ define signext i32 @test_rint_ui32(half %x) { ; ; RV64IZHINX-LABEL: test_rint_ui32: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB22_2 @@ -4115,8 +4115,8 @@ define i64 @test_rint_ui64(half %x) nounwind { ; ; RV32IZHINX-LABEL: test_rint_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; 
RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB23_2 @@ -4134,9 +4134,9 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: neg s1, a0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixunssfdi -; RV32IZHINX-NEXT: lui a2, %hi(.LCPI23_1) -; RV32IZHINX-NEXT: lw a2, %lo(.LCPI23_1)(a2) ; RV32IZHINX-NEXT: and a0, s1, a0 +; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 @@ -4150,8 +4150,8 @@ define i64 @test_rint_ui64(half %x) nounwind { ; ; RV64IZHINX-LABEL: test_rint_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB23_2 @@ -4249,9 +4249,9 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: neg s1, a0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixunssfdi -; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI23_0)(a2) ; RV32IZHINXMIN-NEXT: and a0, s1, a0 +; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv.ll b/llvm/test/CodeGen/RISCV/half-round-conv.ll index 2a1e0cfdda83e..8eeea07426575 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv.ll @@ -29,8 +29,8 @@ define signext i8 @test_floor_si8(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB0_2 @@ -44,8 +44,8 @@ define signext i8 @test_floor_si8(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI0_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI0_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB0_2 @@ -144,8 +144,8 @@ define signext i16 @test_floor_si16(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB1_2 @@ -159,8 +159,8 @@ define signext i16 @test_floor_si16(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI1_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB1_2 @@ -254,8 +254,8 @@ define signext i32 @test_floor_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI2_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI2_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; 
CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB2_2 @@ -335,8 +335,8 @@ define i64 @test_floor_si64(half %x) { ; ; RV32IZHINX-LABEL: test_floor_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB3_2 @@ -356,8 +356,8 @@ define i64 @test_floor_si64(half %x) { ; ; RV64IZHINX-LABEL: test_floor_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI3_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI3_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB3_2 @@ -466,8 +466,8 @@ define zeroext i8 @test_floor_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB4_2 @@ -481,8 +481,8 @@ define zeroext i8 @test_floor_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI4_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI4_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB4_2 @@ -581,8 +581,8 @@ define zeroext i16 @test_floor_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB5_2 @@ -596,8 +596,8 @@ define zeroext i16 @test_floor_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI5_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI5_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB5_2 @@ -691,8 +691,8 @@ define signext i32 @test_floor_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI6_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI6_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB6_2 @@ -772,8 +772,8 @@ define i64 @test_floor_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_floor_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB7_2 @@ -793,8 +793,8 @@ define i64 @test_floor_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_floor_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI7_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI7_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, 
a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB7_2 @@ -903,8 +903,8 @@ define signext i8 @test_ceil_si8(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB8_2 @@ -918,8 +918,8 @@ define signext i8 @test_ceil_si8(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI8_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI8_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB8_2 @@ -1018,8 +1018,8 @@ define signext i16 @test_ceil_si16(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1033,8 +1033,8 @@ define signext i16 @test_ceil_si16(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI9_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI9_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB9_2 @@ -1128,8 +1128,8 @@ define signext i32 @test_ceil_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI10_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI10_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB10_2 @@ -1209,8 +1209,8 @@ define i64 @test_ceil_si64(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1230,8 +1230,8 @@ define i64 @test_ceil_si64(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI11_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI11_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB11_2 @@ -1340,8 +1340,8 @@ define zeroext i8 @test_ceil_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB12_2 @@ -1355,8 +1355,8 @@ define zeroext i8 @test_ceil_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI12_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI12_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB12_2 @@ -1455,8 +1455,8 @@ define zeroext 
i16 @test_ceil_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB13_2 @@ -1470,8 +1470,8 @@ define zeroext i16 @test_ceil_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI13_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI13_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB13_2 @@ -1565,8 +1565,8 @@ define signext i32 @test_ceil_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI14_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI14_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB14_2 @@ -1646,8 +1646,8 @@ define i64 @test_ceil_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_ceil_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB15_2 @@ -1667,8 +1667,8 @@ define i64 @test_ceil_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_ceil_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI15_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI15_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB15_2 @@ -1777,8 +1777,8 @@ define signext i8 @test_trunc_si8(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB16_2 @@ -1792,8 +1792,8 @@ define signext i8 @test_trunc_si8(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI16_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI16_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB16_2 @@ -1892,8 +1892,8 @@ define signext i16 @test_trunc_si16(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB17_2 @@ -1907,8 +1907,8 @@ define signext i16 @test_trunc_si16(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI17_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI17_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB17_2 @@ -2002,8 +2002,8 @@ define signext i32 @test_trunc_si32(half %x) { ; ; 
CHECKIZHINX-LABEL: test_trunc_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI18_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI18_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB18_2 @@ -2083,8 +2083,8 @@ define i64 @test_trunc_si64(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB19_2 @@ -2104,8 +2104,8 @@ define i64 @test_trunc_si64(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI19_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI19_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB19_2 @@ -2214,8 +2214,8 @@ define zeroext i8 @test_trunc_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB20_2 @@ -2229,8 +2229,8 @@ define zeroext i8 @test_trunc_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI20_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI20_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB20_2 @@ -2329,8 +2329,8 @@ define zeroext i16 @test_trunc_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB21_2 @@ -2344,8 +2344,8 @@ define zeroext i16 @test_trunc_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI21_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI21_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB21_2 @@ -2439,8 +2439,8 @@ define signext i32 @test_trunc_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI22_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI22_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB22_2 @@ -2520,8 +2520,8 @@ define i64 @test_trunc_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_trunc_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB23_2 @@ -2541,8 +2541,8 @@ define i64 @test_trunc_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_trunc_ui64: ; 
RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI23_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI23_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB23_2 @@ -2651,8 +2651,8 @@ define signext i8 @test_round_si8(half %x) { ; ; RV32IZHINX-LABEL: test_round_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI24_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI24_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB24_2 @@ -2666,8 +2666,8 @@ define signext i8 @test_round_si8(half %x) { ; ; RV64IZHINX-LABEL: test_round_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI24_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI24_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB24_2 @@ -2766,8 +2766,8 @@ define signext i16 @test_round_si16(half %x) { ; ; RV32IZHINX-LABEL: test_round_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI25_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI25_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB25_2 @@ -2781,8 +2781,8 @@ define signext i16 @test_round_si16(half %x) { ; ; RV64IZHINX-LABEL: test_round_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI25_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI25_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB25_2 @@ -2876,8 +2876,8 @@ define signext i32 @test_round_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI26_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI26_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB26_2 @@ -2957,8 +2957,8 @@ define i64 @test_round_si64(half %x) { ; ; RV32IZHINX-LABEL: test_round_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI27_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI27_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB27_2 @@ -2978,8 +2978,8 @@ define i64 @test_round_si64(half %x) { ; ; RV64IZHINX-LABEL: test_round_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI27_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI27_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB27_2 @@ -3088,8 +3088,8 @@ define zeroext i8 @test_round_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI28_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI28_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB28_2 @@ -3103,8 +3103,8 @@ define zeroext i8 @test_round_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui 
a1, %hi(.LCPI28_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI28_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB28_2 @@ -3203,8 +3203,8 @@ define zeroext i16 @test_round_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI29_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI29_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB29_2 @@ -3218,8 +3218,8 @@ define zeroext i16 @test_round_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI29_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI29_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB29_2 @@ -3313,8 +3313,8 @@ define signext i32 @test_round_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_round_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI30_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI30_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB30_2 @@ -3394,8 +3394,8 @@ define i64 @test_round_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_round_ui64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI31_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI31_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB31_2 @@ -3415,8 +3415,8 @@ define i64 @test_round_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_round_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI31_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI31_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB31_2 @@ -3525,8 +3525,8 @@ define signext i8 @test_roundeven_si8(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI32_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB32_2 @@ -3540,8 +3540,8 @@ define signext i8 @test_roundeven_si8(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI32_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB32_2 @@ -3640,8 +3640,8 @@ define signext i16 @test_roundeven_si16(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI33_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI33_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB33_2 @@ -3655,8 +3655,8 @@ define signext i16 @test_roundeven_si16(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, 
%hi(.LCPI33_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI33_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB33_2 @@ -3750,8 +3750,8 @@ define signext i32 @test_roundeven_si32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_si32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI34_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI34_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB34_2 @@ -3831,8 +3831,8 @@ define i64 @test_roundeven_si64(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_si64: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI35_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI35_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB35_2 @@ -3852,8 +3852,8 @@ define i64 @test_roundeven_si64(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_si64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI35_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI35_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB35_2 @@ -3962,8 +3962,8 @@ define zeroext i8 @test_roundeven_ui8(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui8: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI36_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI36_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB36_2 @@ -3977,8 +3977,8 @@ define zeroext i8 @test_roundeven_ui8(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui8: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI36_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI36_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB36_2 @@ -4077,8 +4077,8 @@ define zeroext i16 @test_roundeven_ui16(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui16: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI37_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI37_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB37_2 @@ -4092,8 +4092,8 @@ define zeroext i16 @test_roundeven_ui16(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui16: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI37_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI37_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB37_2 @@ -4187,8 +4187,8 @@ define signext i32 @test_roundeven_ui32(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_ui32: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI38_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI38_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB38_2 @@ -4268,8 +4268,8 @@ define i64 @test_roundeven_ui64(half %x) { ; ; RV32IZHINX-LABEL: test_roundeven_ui64: ; RV32IZHINX: # 
%bb.0: -; RV32IZHINX-NEXT: lui a1, %hi(.LCPI39_0) -; RV32IZHINX-NEXT: lh a1, %lo(.LCPI39_0)(a1) +; RV32IZHINX-NEXT: li a1, 25 +; RV32IZHINX-NEXT: slli a1, a1, 10 ; RV32IZHINX-NEXT: fabs.h a2, a0 ; RV32IZHINX-NEXT: flt.h a1, a2, a1 ; RV32IZHINX-NEXT: beqz a1, .LBB39_2 @@ -4289,8 +4289,8 @@ define i64 @test_roundeven_ui64(half %x) { ; ; RV64IZHINX-LABEL: test_roundeven_ui64: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a1, %hi(.LCPI39_0) -; RV64IZHINX-NEXT: lh a1, %lo(.LCPI39_0)(a1) +; RV64IZHINX-NEXT: li a1, 25 +; RV64IZHINX-NEXT: slli a1, a1, 10 ; RV64IZHINX-NEXT: fabs.h a2, a0 ; RV64IZHINX-NEXT: flt.h a1, a2, a1 ; RV64IZHINX-NEXT: beqz a1, .LBB39_2 @@ -4424,8 +4424,8 @@ define half @test_floor_half(half %x) { ; ; CHECKIZHINX-LABEL: test_floor_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI40_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI40_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB40_2 @@ -4508,8 +4508,8 @@ define half @test_ceil_half(half %x) { ; ; CHECKIZHINX-LABEL: test_ceil_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI41_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI41_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB41_2 @@ -4592,8 +4592,8 @@ define half @test_trunc_half(half %x) { ; ; CHECKIZHINX-LABEL: test_trunc_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI42_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI42_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB42_2 @@ -4676,8 +4676,8 @@ define half @test_round_half(half %x) { ; ; CHECKIZHINX-LABEL: test_round_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI43_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI43_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB43_2 @@ -4760,8 +4760,8 @@ define half @test_roundeven_half(half %x) { ; ; CHECKIZHINX-LABEL: test_roundeven_half: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: lui a1, %hi(.LCPI44_0) -; CHECKIZHINX-NEXT: lh a1, %lo(.LCPI44_0)(a1) +; CHECKIZHINX-NEXT: li a1, 25 +; CHECKIZHINX-NEXT: slli a1, a1, 10 ; CHECKIZHINX-NEXT: fabs.h a2, a0 ; CHECKIZHINX-NEXT: flt.h a1, a2, a1 ; CHECKIZHINX-NEXT: beqz a1, .LBB44_2 From 56b2be4a7608770bae5db9d467f50c232c3cf19a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 5 Sep 2024 16:46:02 +0100 Subject: [PATCH 246/425] [X86] Fold scalar_to_vector(funnel(x,y,imm)) -> funnel(scalar_to_vector(x),scalar_to_vector(y),imm) Limit this to cases where x, y are known to be extracted from a vector. 
Addresses poor x86 codegen on #107289 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 ++++++++ .../CodeGen/X86/vector-shuffle-combining.ll | 42 +++---------------- 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a4ad4a1bb1201..c91d37727b611 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57500,6 +57500,24 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, } } break; + case ISD::FSHL: + case ISD::FSHR: + if (auto *Amt = dyn_cast(Src.getOperand(2))) { + if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) && + Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Src.hasOneUse()) { + uint64_t AmtVal = + Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits()); + SDValue SrcVec0 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0)); + SDValue SrcVec1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1)); + return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1, + DAG.getConstant(AmtVal, DL, VT)); + } + } + break; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 04262b4249256..36c4be5f1939e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3534,46 +3534,14 @@ define <4 x i32> @PR63700(i128 %0) { } define <16 x i8> @PR107289(<16 x i8> %0) { -; SSE2-LABEL: PR107289: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: shldq $8, %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: psllq $8, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR107289: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSSE3-NEXT: movq %xmm1, %rcx -; SSSE3-NEXT: shldq $8, %rax, %rcx -; SSSE3-NEXT: movq %rcx, %xmm1 -; SSSE3-NEXT: psllq $8, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR107289: -; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: shldq $8, %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: psllq $8, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE-LABEL: PR107289: +; SSE: # %bb.0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE-NEXT: retq ; ; AVX-LABEL: PR107289: ; AVX: # %bb.0: -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: shldq $8, %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; AVX-NEXT: retq %src = bitcast <16 x i8> %0 to i128 %shl = shl i128 %src, 8 From 2f6e4ed389a6589f340d7efab2b0c7ee22c3d086 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 16:48:22 +0100 Subject: [PATCH 247/425] [IR] Check parameters of target extension types on construction (#107268) Since IR Types are immutable it makes sense to check them on construction instead of in the IR Verifier pass. 
This patch checks that some TargetExtTypes are well-formed in the sense that they have the expected number of type parameters and integer parameters. When called from LLParser it gives a diagnostic message. When called from anywhere else it just asserts that they are well-formed. --- llvm/include/llvm/IR/DerivedTypes.h | 14 +++++++ llvm/lib/AsmParser/LLParser.cpp | 7 +++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 6 ++- llvm/lib/IR/Type.cpp | 39 +++++++++++++++---- .../Assembler/target-type-param-errors.ll | 12 ++++++ 5 files changed, 69 insertions(+), 9 deletions(-) create mode 100644 llvm/test/Assembler/target-type-param-errors.ll diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 01f76d4932780..0c8cbe1921ac9 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -32,6 +32,7 @@ namespace llvm { class Value; class APInt; class LLVMContext; +template class Expected; /// Class to represent integer types. Note that this class is also used to /// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and @@ -735,6 +736,19 @@ class TargetExtType : public Type { ArrayRef Types = std::nullopt, ArrayRef Ints = std::nullopt); + /// Return a target extension type having the specified name and optional + /// type and integer parameters, or an appropriate Error if it fails the + /// parameters check. + static Expected + getOrError(LLVMContext &Context, StringRef Name, + ArrayRef Types = std::nullopt, + ArrayRef Ints = std::nullopt); + + /// Check that a newly created target extension type has the expected number + /// of type parameters and integer parameters, returning the type itself if OK + /// or an appropriate Error if not. + static Expected checkParams(TargetExtType *TTy); + /// Return the name for this target extension type. Two distinct target /// extension types may have the same name if their type or integer parameters /// differ. 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index f41907f035125..93dc2bd241581 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3530,7 +3530,12 @@ bool LLParser::parseTargetExtType(Type *&Result) { if (parseToken(lltok::rparen, "expected ')' in target extension type")) return true; - Result = TargetExtType::get(Context, TypeName, TypeParams, IntParams); + auto TTy = + TargetExtType::getOrError(Context, TypeName, TypeParams, IntParams); + if (auto E = TTy.takeError()) + return tokError(toString(std::move(E))); + + Result = *TTy; return false; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 654be985a3229..1cd9ec6b8fca2 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2679,7 +2679,11 @@ Error BitcodeReader::parseTypeTableBody() { return error("Integer parameter too large"); IntParams.push_back(Record[i]); } - ResultTy = TargetExtType::get(Context, TypeName, TypeParams, IntParams); + auto TTy = + TargetExtType::getOrError(Context, TypeName, TypeParams, IntParams); + if (auto E = TTy.takeError()) + return E; + ResultTy = *TTy; TypeName.clear(); break; } diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 753aa5fd2118e..93891461dd663 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" #include "llvm/Support/TypeSize.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" @@ -792,6 +793,13 @@ TargetExtType::TargetExtType(LLVMContext &C, StringRef Name, TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, ArrayRef Types, ArrayRef Ints) { + return cantFail(getOrError(C, Name, Types, Ints)); +} + +Expected TargetExtType::getOrError(LLVMContext &C, + StringRef Name, + ArrayRef Types, + ArrayRef Ints) { const TargetExtTypeKeyInfo::KeyTy Key(Name, Types, Ints); TargetExtType *TT; // Since we only want to allocate a fresh target type in case none is found @@ -799,8 +807,8 @@ TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, // one for inserting the newly allocated one), here we instead lookup based on // Key and update the reference to the target type in-place to a newly // allocated one if not found. - auto Insertion = C.pImpl->TargetExtTypes.insert_as(nullptr, Key); - if (Insertion.second) { + auto [Iter, Inserted] = C.pImpl->TargetExtTypes.insert_as(nullptr, Key); + if (Inserted) { // The target type was not found. Allocate one and update TargetExtTypes // in-place. TT = (TargetExtType *)C.pImpl->Alloc.Allocate( @@ -808,12 +816,29 @@ TargetExtType *TargetExtType::get(LLVMContext &C, StringRef Name, sizeof(unsigned) * Ints.size(), alignof(TargetExtType)); new (TT) TargetExtType(C, Name, Types, Ints); - *Insertion.first = TT; - } else { - // The target type was found. Just return it. - TT = *Insertion.first; + *Iter = TT; + return checkParams(TT); } - return TT; + + // The target type was found. Just return it. + return *Iter; +} + +Expected TargetExtType::checkParams(TargetExtType *TTy) { + // Opaque types in the AArch64 name space. + if (TTy->Name == "aarch64.svcount" && + (TTy->getNumTypeParameters() != 0 || TTy->getNumIntParameters() != 0)) + return createStringError( + "target extension type aarch64.svcount should have no parameters"); + + // Opaque types in the RISC-V name space. 
+ if (TTy->Name == "riscv.vector.tuple" && + (TTy->getNumTypeParameters() != 1 || TTy->getNumIntParameters() != 1)) + return createStringError( + "target extension type riscv.vector.tuple should have one " + "type parameter and one integer parameter"); + + return TTy; } namespace { diff --git a/llvm/test/Assembler/target-type-param-errors.ll b/llvm/test/Assembler/target-type-param-errors.ll new file mode 100644 index 0000000000000..03180811c7549 --- /dev/null +++ b/llvm/test/Assembler/target-type-param-errors.ll @@ -0,0 +1,12 @@ +; RUN: split-file %s %t +; RUN: not llvm-as < %t/aarch64-svcount.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SVCOUNT %s +; RUN: not llvm-as < %t/riscv-vector-tuple.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-RISCV-VECTOR-TUPLE %s +; Check target extension type properties are verified in the assembler. + +;--- aarch64-svcount.ll +declare target("aarch64.svcount", i32) @aarch64_svcount() +; CHECK-AARCH64-SVCOUNT: error: target extension type aarch64.svcount should have no parameters + +;--- riscv-vector-tuple.ll +declare target("riscv.vector.tuple", 99) @riscv_vector_tuple() +; CHECK-RISCV-VECTOR-TUPLE: target extension type riscv.vector.tuple should have one type parameter and one integer parameter From fc3e6a81868a0c84e405622a64756e57f020ca37 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 5 Sep 2024 19:54:32 +0400 Subject: [PATCH 248/425] DAG: Handle lowering unordered compare with inf (#100378) Try to take advantage of the nan check behavior of fcmp. x86_64 looks better, x86_32 looks worse. --- llvm/include/llvm/CodeGen/CodeGenCommonISel.h | 6 +- llvm/lib/CodeGen/CodeGenCommonISel.cpp | 8 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 58 +++++++++++-------- llvm/test/CodeGen/AArch64/isinf.ll | 12 ++-- llvm/test/CodeGen/PowerPC/fp-classify.ll | 28 ++++----- 5 files changed, 67 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h index 90ef890f22d1b..4c22be9450786 100644 --- a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h +++ b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h @@ -218,10 +218,14 @@ findSplitPointForStackProtector(MachineBasicBlock *BB, /// Evaluates if the specified FP class test is better performed as the inverse /// (i.e. fewer instructions should be required to lower it). An example is the /// test "inf|normal|subnormal|zero", which is an inversion of "nan". +/// /// \param Test The test as specified in 'is_fpclass' intrinsic invocation. +/// \param UseFCmp The intention is to perform the comparison using +/// floating-point compare instructions which check for nan. +/// /// \returns The inverted test, or fcNone, if inversion does not produce a /// simpler test. -FPClassTest invertFPClassTestIfSimpler(FPClassTest Test); +FPClassTest invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp); /// Assuming the instruction \p MI is going to be deleted, attempt to salvage /// debug users of \p MI by writing the effect of \p MI in a DIExpression. 
diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index fe144d3c18203..d985751e2be0b 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -173,8 +173,9 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, return SplitPoint; } -FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) { +FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp) { FPClassTest InvertedTest = ~Test; + // Pick the direction with fewer tests // TODO: Handle more combinations of cases that can be handled together switch (static_cast(InvertedTest)) { @@ -200,6 +201,11 @@ FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) { case fcSubnormal | fcZero: case fcSubnormal | fcZero | fcNan: return InvertedTest; + case fcInf | fcNan: + // If we're trying to use fcmp, we can take advantage of the nan check + // behavior of the compare (but this is more instructions in the integer + // expansion). + return UseFCmp ? InvertedTest : fcNone; default: return fcNone; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 01feec0c435ed..c3affabb19d37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8675,7 +8675,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, // Degenerated cases. if (Test == fcNone) return DAG.getBoolConstant(false, DL, ResultVT, OperandVT); - if ((Test & fcAllFlags) == fcAllFlags) + if (Test == fcAllFlags) return DAG.getBoolConstant(true, DL, ResultVT, OperandVT); // PPC double double is a pair of doubles, of which the higher part determines @@ -8686,14 +8686,6 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, OperandVT = MVT::f64; } - // Some checks may be represented as inversion of simpler check, for example - // "inf|normal|subnormal|zero" => !"nan". - bool IsInverted = false; - if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test)) { - IsInverted = true; - Test = InvertedCheck; - } - // Floating-point type properties. EVT ScalarFloatVT = OperandVT.getScalarType(); const Type *FloatTy = ScalarFloatVT.getTypeForEVT(*DAG.getContext()); @@ -8705,9 +8697,16 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, if (Flags.hasNoFPExcept() && isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) { FPClassTest FPTestMask = Test; + bool IsInvertedFP = false; + + if (FPClassTest InvertedFPCheck = + invertFPClassTestIfSimpler(FPTestMask, true)) { + FPTestMask = InvertedFPCheck; + IsInvertedFP = true; + } - ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ; - ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ; + ISD::CondCode OrderedCmpOpcode = IsInvertedFP ? ISD::SETUNE : ISD::SETOEQ; + ISD::CondCode UnorderedCmpOpcode = IsInvertedFP ? ISD::SETONE : ISD::SETUEQ; // See if we can fold an | fcNan into an unordered compare. FPClassTest OrderedFPTestMask = FPTestMask & ~fcNan; @@ -8720,7 +8719,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, const bool IsOrdered = FPTestMask == OrderedFPTestMask; if (std::optional IsCmp0 = - isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction()); + isFCmpEqualZero(FPTestMask, Semantics, DAG.getMachineFunction()); IsCmp0 && (isCondCodeLegalOrCustom( *IsCmp0 ? 
OrderedCmpOpcode : UnorderedCmpOpcode, OperandVT.getScalarType().getSimpleVT()))) { @@ -8732,31 +8731,35 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode); } - if (Test == fcNan && - isCondCodeLegalOrCustom(IsInverted ? ISD::SETO : ISD::SETUO, - OperandVT.getScalarType().getSimpleVT())) { + if (FPTestMask == fcNan && + isCondCodeLegalOrCustom(IsInvertedFP ? ISD::SETO : ISD::SETUO, + OperandVT.getScalarType().getSimpleVT())) return DAG.getSetCC(DL, ResultVT, Op, Op, - IsInverted ? ISD::SETO : ISD::SETUO); - } + IsInvertedFP ? ISD::SETO : ISD::SETUO); - if (Test == fcInf && - isCondCodeLegalOrCustom(IsInverted ? ISD::SETUNE : ISD::SETOEQ, + bool IsOrderedInf = FPTestMask == fcInf; + if ((FPTestMask == fcInf || FPTestMask == (fcInf | fcNan)) && + isCondCodeLegalOrCustom(IsOrderedInf ? OrderedCmpOpcode + : UnorderedCmpOpcode, OperandVT.getScalarType().getSimpleVT()) && - isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType())) { + isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType()) && + (isOperationLegal(ISD::ConstantFP, OperandVT.getScalarType()) || + (OperandVT.isVector() && + isOperationLegalOrCustom(ISD::BUILD_VECTOR, OperandVT)))) { // isinf(x) --> fabs(x) == inf SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op); SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT); return DAG.getSetCC(DL, ResultVT, Abs, Inf, - IsInverted ? ISD::SETUNE : ISD::SETOEQ); + IsOrderedInf ? OrderedCmpOpcode : UnorderedCmpOpcode); } if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) { // TODO: Could handle ordered case, but it produces worse code for // x86. Maybe handle ordered if fabs is free? - ISD::CondCode OrderedOp = IsInverted ? ISD::SETUGE : ISD::SETOLT; - ISD::CondCode UnorderedOp = IsInverted ? ISD::SETOGE : ISD::SETULT; + ISD::CondCode OrderedOp = IsInvertedFP ? ISD::SETUGE : ISD::SETOLT; + ISD::CondCode UnorderedOp = IsInvertedFP ? ISD::SETOGE : ISD::SETULT; if (isCondCodeLegalOrCustom(IsOrdered ? OrderedOp : UnorderedOp, OperandVT.getScalarType().getSimpleVT())) { @@ -8773,6 +8776,15 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, } } + // Some checks may be represented as inversion of simpler check, for example + // "inf|normal|subnormal|zero" => !"nan". + bool IsInverted = false; + + if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test, false)) { + Test = InvertedCheck; + IsInverted = true; + } + // In the general case use integer operations. 
unsigned BitSize = OperandVT.getScalarSizeInBits(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize); diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll index 834417b98743a..e68539bcf07d9 100644 --- a/llvm/test/CodeGen/AArch64/isinf.ll +++ b/llvm/test/CodeGen/AArch64/isinf.ll @@ -26,10 +26,10 @@ define i32 @replace_isinf_call_f16(half %x) { define i32 @replace_isinf_call_f32(float %x) { ; CHECK-LABEL: replace_isinf_call_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fabs s0, s0 +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: and w9, w9, #0x7fffffff +; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call float @llvm.fabs.f32(float %x) @@ -42,10 +42,10 @@ define i32 @replace_isinf_call_f32(float %x) { define i32 @replace_isinf_call_f64(double %x) { ; CHECK-LABEL: replace_isinf_call_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fabs d0, d0 +; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: mov x8, #9218868437227405312 // =0x7ff0000000000000 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: and x9, x9, #0x7fffffffffffffff +; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %abs = tail call double @llvm.fabs.f64(double %x) diff --git a/llvm/test/CodeGen/PowerPC/fp-classify.ll b/llvm/test/CodeGen/PowerPC/fp-classify.ll index f527b3c48040e..dc9853ff2e301 100644 --- a/llvm/test/CodeGen/PowerPC/fp-classify.ll +++ b/llvm/test/CodeGen/PowerPC/fp-classify.ll @@ -7,13 +7,13 @@ define zeroext i1 @abs_isinff(float %x) { ; P8-LABEL: abs_isinff: ; P8: # %bb.0: # %entry -; P8-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; P8-NEXT: xsabsdp 0, 1 -; P8-NEXT: li 4, 1 -; P8-NEXT: lfs 1, .LCPI0_0@toc@l(3) -; P8-NEXT: li 3, 0 -; P8-NEXT: fcmpu 0, 0, 1 -; P8-NEXT: iseleq 3, 4, 3 +; P8-NEXT: xscvdpspn 0, 1 +; P8-NEXT: lis 4, 32640 +; P8-NEXT: mffprwz 3, 0 +; P8-NEXT: clrlwi 3, 3, 1 +; P8-NEXT: xor 3, 3, 4 +; P8-NEXT: cntlzw 3, 3 +; P8-NEXT: srwi 3, 3, 5 ; P8-NEXT: blr ; ; P9-LABEL: abs_isinff: @@ -32,13 +32,13 @@ entry: define zeroext i1 @abs_isinf(double %x) { ; P8-LABEL: abs_isinf: ; P8: # %bb.0: # %entry -; P8-NEXT: addis 3, 2, .LCPI1_0@toc@ha -; P8-NEXT: xsabsdp 0, 1 -; P8-NEXT: li 4, 1 -; P8-NEXT: lfs 1, .LCPI1_0@toc@l(3) -; P8-NEXT: li 3, 0 -; P8-NEXT: fcmpu 0, 0, 1 -; P8-NEXT: iseleq 3, 4, 3 +; P8-NEXT: mffprd 3, 1 +; P8-NEXT: li 4, 2047 +; P8-NEXT: rldic 4, 4, 52, 1 +; P8-NEXT: clrldi 3, 3, 1 +; P8-NEXT: xor 3, 3, 4 +; P8-NEXT: cntlzd 3, 3 +; P8-NEXT: rldicl 3, 3, 58, 63 ; P8-NEXT: blr ; ; P9-LABEL: abs_isinf: From c2018fa40fd081a10af4f3294362db9634d9a282 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 09:01:19 -0700 Subject: [PATCH 249/425] [NFC][Support] Refactor FormatVariadic code. (#106610) - Rename `Align` field in ReplacementItem/FmtAlign to `Width` to accurately reflect its use. - Change both `Width` and `Index` in ReplacementItem to 32-bit int instead of size_t (as 64-bits seems excessive in this context). - Eliminate the use of `Empty` ReplacementType, and use the existing std::optional<> instead to indicate that. - Eliminate some boilerplate type code in formatv(). - Eliminate the loop in `splitLiteralAndReplacement`. The existing code will never loop back. - Directly use constructor instead of std::make_pair. 
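For reference, `Width` is the field-width part of a formatv replacement spec, `{index[,layout][:options]}`, where the layout is an optional fill and alignment character followed by the width. A minimal usage sketch follows; it is illustrative only and not part of this change (the format string, values, and function name are made up):

  // Illustrative sketch, not part of the patch: exercises the width/alignment
  // handling in FmtAlign and the renamed ReplacementItem::Width field.
  #include "llvm/Support/FormatVariadic.h"
  #include "llvm/Support/raw_ostream.h"

  void formatvWidthDemo() {
    // "{0,-8}"    -> argument 0, left-aligned in an 8-character field.
    // "{1,=10:x}" -> argument 1, centered in a 10-character field, hex option.
    llvm::outs() << llvm::formatv("{0,-8}|{1,=10:x}|\n", "name", 255);
  }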
--- llvm/include/llvm/Support/FormatCommon.h | 20 ++-- llvm/include/llvm/Support/FormatVariadic.h | 27 ++--- llvm/lib/Support/FormatVariadic.cpp | 108 ++++++++---------- llvm/unittests/Support/FormatVariadicTest.cpp | 40 +++---- 4 files changed, 90 insertions(+), 105 deletions(-) diff --git a/llvm/include/llvm/Support/FormatCommon.h b/llvm/include/llvm/Support/FormatCommon.h index 326e00936aa7c..eaf291b864c5e 100644 --- a/llvm/include/llvm/Support/FormatCommon.h +++ b/llvm/include/llvm/Support/FormatCommon.h @@ -16,15 +16,17 @@ namespace llvm { enum class AlignStyle { Left, Center, Right }; +/// Helper class to format to a \p Width wide field, with alignment \p Where +/// within that field. struct FmtAlign { support::detail::format_adapter &Adapter; AlignStyle Where; - size_t Amount; + unsigned Width; char Fill; FmtAlign(support::detail::format_adapter &Adapter, AlignStyle Where, - size_t Amount, char Fill = ' ') - : Adapter(Adapter), Where(Where), Amount(Amount), Fill(Fill) {} + unsigned Width, char Fill = ' ') + : Adapter(Adapter), Where(Where), Width(Width), Fill(Fill) {} void format(raw_ostream &S, StringRef Options) { // If we don't need to align, we can format straight into the underlying @@ -32,7 +34,7 @@ struct FmtAlign { // in order to calculate how long the output is so we can align it. // TODO: Make the format method return the number of bytes written, that // way we can also skip the intermediate stream for left-aligned output. - if (Amount == 0) { + if (Width == 0) { Adapter.format(S, Options); return; } @@ -40,19 +42,19 @@ struct FmtAlign { raw_svector_ostream Stream(Item); Adapter.format(Stream, Options); - if (Amount <= Item.size()) { + if (Width <= Item.size()) { S << Item; return; } - size_t PadAmount = Amount - Item.size(); + unsigned PadAmount = Width - static_cast(Item.size()); switch (Where) { case AlignStyle::Left: S << Item; fill(S, PadAmount); break; case AlignStyle::Center: { - size_t X = PadAmount / 2; + unsigned X = PadAmount / 2; fill(S, X); S << Item; fill(S, PadAmount - X); @@ -66,8 +68,8 @@ struct FmtAlign { } private: - void fill(llvm::raw_ostream &S, size_t Count) { - for (size_t I = 0; I < Count; ++I) + void fill(llvm::raw_ostream &S, unsigned Count) { + for (unsigned I = 0; I < Count; ++I) S << Fill; } }; diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index f31ad70021579..005d26f02d8fd 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -43,21 +43,20 @@ namespace llvm { -enum class ReplacementType { Empty, Format, Literal }; +enum class ReplacementType { Format, Literal }; struct ReplacementItem { - ReplacementItem() = default; explicit ReplacementItem(StringRef Literal) : Type(ReplacementType::Literal), Spec(Literal) {} - ReplacementItem(StringRef Spec, size_t Index, size_t Align, AlignStyle Where, - char Pad, StringRef Options) - : Type(ReplacementType::Format), Spec(Spec), Index(Index), Align(Align), + ReplacementItem(StringRef Spec, unsigned Index, unsigned Width, + AlignStyle Where, char Pad, StringRef Options) + : Type(ReplacementType::Format), Spec(Spec), Index(Index), Width(Width), Where(Where), Pad(Pad), Options(Options) {} - ReplacementType Type = ReplacementType::Empty; + ReplacementType Type; StringRef Spec; - size_t Index = 0; - size_t Align = 0; + unsigned Index = 0; + unsigned Width = 0; AlignStyle Where = AlignStyle::Right; char Pad = 0; StringRef Options; @@ -81,8 +80,6 @@ class formatv_object_base { void format(raw_ostream &S) 
const { const auto Replacements = parseFormatString(Fmt, Adapters.size(), Validate); for (const auto &R : Replacements) { - if (R.Type == ReplacementType::Empty) - continue; if (R.Type == ReplacementType::Literal) { S << R.Spec; continue; @@ -94,7 +91,7 @@ class formatv_object_base { auto *W = Adapters[R.Index]; - FmtAlign Align(*W, R.Where, R.Align, R.Pad); + FmtAlign Align(*W, R.Where, R.Width, R.Pad); Align.format(S, R.Options); } } @@ -248,14 +245,10 @@ template class formatv_object : public formatv_object_base { // formatv() with validation enable/disable controlled by the first argument. template -inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) - -> formatv_object(Vals))...))> { - using ParamTuple = decltype(std::make_tuple( - support::detail::build_format_adapter(std::forward(Vals))...)); +inline auto formatv(bool Validate, const char *Fmt, Ts &&...Vals) { auto Params = std::make_tuple( support::detail::build_format_adapter(std::forward(Vals))...); - return formatv_object(Fmt, std::move(Params), Validate); + return formatv_object(Fmt, std::move(Params), Validate); } // formatv() with validation enabled. diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index 26d2b549136e4..9056466190284 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -26,7 +26,7 @@ static std::optional translateLocChar(char C) { } static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where, - size_t &Align, char &Pad) { + unsigned &Align, char &Pad) { Where = AlignStyle::Right; Align = 0; Pad = ' '; @@ -60,14 +60,14 @@ static std::optional parseReplacementItem(StringRef Spec) { // If the replacement sequence does not start with a non-negative integer, // this is an error. char Pad = ' '; - std::size_t Align = 0; + unsigned Align = 0; AlignStyle Where = AlignStyle::Right; StringRef Options; - size_t Index = 0; + unsigned Index = 0; RepString = RepString.trim(); if (RepString.consumeInteger(0, Index)) { assert(false && "Invalid replacement sequence index!"); - return ReplacementItem{}; + return std::nullopt; } RepString = RepString.trim(); if (RepString.consume_front(",")) { @@ -83,61 +83,50 @@ static std::optional parseReplacementItem(StringRef Spec) { assert(RepString.empty() && "Unexpected characters found in replacement string!"); - return ReplacementItem{Spec, Index, Align, Where, Pad, Options}; + return ReplacementItem(Spec, Index, Align, Where, Pad, Options); } -static std::pair +static std::pair, StringRef> splitLiteralAndReplacement(StringRef Fmt) { - while (!Fmt.empty()) { - // Everything up until the first brace is a literal. - if (Fmt.front() != '{') { - std::size_t BO = Fmt.find_first_of('{'); - return std::make_pair(ReplacementItem{Fmt.substr(0, BO)}, Fmt.substr(BO)); - } - - StringRef Braces = Fmt.take_while([](char C) { return C == '{'; }); - // If there is more than one brace, then some of them are escaped. Treat - // these as replacements. - if (Braces.size() > 1) { - size_t NumEscapedBraces = Braces.size() / 2; - StringRef Middle = Fmt.take_front(NumEscapedBraces); - StringRef Right = Fmt.drop_front(NumEscapedBraces * 2); - return std::make_pair(ReplacementItem{Middle}, Right); - } - // An unterminated open brace is undefined. Assert to indicate that this is - // undefined and that we consider it an error. When asserts are disabled, - // build a replacement item with an error message. 
- std::size_t BC = Fmt.find_first_of('}'); - if (BC == StringRef::npos) { - assert( - false && - "Unterminated brace sequence. Escape with {{ for a literal brace."); - return std::make_pair( - ReplacementItem{"Unterminated brace sequence. Escape with {{ for a " - "literal brace."}, - StringRef()); - } + assert(!Fmt.empty()); + // Everything up until the first brace is a literal. + if (Fmt.front() != '{') { + size_t BO = Fmt.find_first_of('{'); + return {ReplacementItem{Fmt.substr(0, BO)}, Fmt.substr(BO)}; + } - // Even if there is a closing brace, if there is another open brace before - // this closing brace, treat this portion as literal, and try again with the - // next one. - std::size_t BO2 = Fmt.find_first_of('{', 1); - if (BO2 < BC) - return std::make_pair(ReplacementItem{Fmt.substr(0, BO2)}, - Fmt.substr(BO2)); + StringRef Braces = Fmt.take_while([](char C) { return C == '{'; }); + // If there is more than one brace, then some of them are escaped. Treat + // these as replacements. + if (Braces.size() > 1) { + size_t NumEscapedBraces = Braces.size() / 2; + StringRef Middle = Fmt.take_front(NumEscapedBraces); + StringRef Right = Fmt.drop_front(NumEscapedBraces * 2); + return {ReplacementItem(Middle), Right}; + } + // An unterminated open brace is undefined. Assert to indicate that this is + // undefined and that we consider it an error. When asserts are disabled, + // build a replacement item with an error message. + size_t BC = Fmt.find_first_of('}'); + if (BC == StringRef::npos) { + assert(false && + "Unterminated brace sequence. Escape with {{ for a literal brace."); + return {ReplacementItem("Unterminated brace sequence. Escape with {{ for a " + "literal brace."), + StringRef()}; + } - StringRef Spec = Fmt.slice(1, BC); - StringRef Right = Fmt.substr(BC + 1); + // Even if there is a closing brace, if there is another open brace before + // this closing brace, treat this portion as literal, and try again with the + // next one. + size_t BO2 = Fmt.find_first_of('{', 1); + if (BO2 < BC) + return {ReplacementItem(Fmt.substr(0, BO2)), Fmt.substr(BO2)}; - auto RI = parseReplacementItem(Spec); - if (RI) - return std::make_pair(*RI, Right); + StringRef Spec = Fmt.slice(1, BC); + StringRef Right = Fmt.substr(BC + 1); - // If there was an error parsing the replacement item, treat it as an - // invalid replacement spec, and just continue. - Fmt = Fmt.drop_front(BC + 1); - } - return std::make_pair(ReplacementItem{Fmt}, StringRef()); + return {parseReplacementItem(Spec), Right}; } #ifndef NDEBUG @@ -153,17 +142,18 @@ formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, #if ENABLE_VALIDATION const StringRef SavedFmtStr = Fmt; - size_t NumExpectedArgs = 0; + unsigned NumExpectedArgs = 0; #endif while (!Fmt.empty()) { - ReplacementItem I; + std::optional I; std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt); - if (I.Type != ReplacementType::Empty) - Replacements.push_back(I); + if (!I) + continue; + Replacements.emplace_back(*I); #if ENABLE_VALIDATION - if (I.Type == ReplacementType::Format) - NumExpectedArgs = std::max(NumExpectedArgs, I.Index + 1); + if (I->Type == ReplacementType::Format) + NumExpectedArgs = std::max(NumExpectedArgs, I->Index + 1); #endif } @@ -195,7 +185,7 @@ formatv_object_base::parseFormatString(StringRef Fmt, size_t NumArgs, // Find the number of unique indices seen. All replacement indices // are < NumExpectedArgs. 
SmallVector Indices(NumExpectedArgs); - size_t Count = 0; + unsigned Count = 0; for (const ReplacementItem &I : Replacements) { if (I.Type != ReplacementType::Format || Indices[I.Index]) continue; diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index 7f1e09b1857dd..4f3d1791c0018 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -87,14 +87,14 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); Replacements = parseFormatString("{1}"); ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(1u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -103,7 +103,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -112,7 +112,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -121,7 +121,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -130,7 +130,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -139,7 +139,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -148,7 +148,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ("foo", Replacements[0].Options); @@ -159,7 +159,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0:0:1", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, 
Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("0:1", Replacements[0].Options); @@ -169,7 +169,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,p+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ('p', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -180,7 +180,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,-+4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ('-', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -190,7 +190,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,+-4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); EXPECT_EQ('+', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -200,7 +200,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,==4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ('=', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -210,7 +210,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ("0,:=4:foo", Replacements[0].Spec); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(4u, Replacements[0].Align); + EXPECT_EQ(4u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Center, Replacements[0].Where); EXPECT_EQ(':', Replacements[0].Pad); EXPECT_EQ("foo", Replacements[0].Options); @@ -222,7 +222,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); // Including if the colon is present but contains no text. @@ -230,7 +230,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { ASSERT_EQ(1u, Replacements.size()); EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(3u, Replacements[0].Align); + EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ("", Replacements[0].Options); // 3. 
If alignment is missing, it defaults to 0, right, space @@ -240,7 +240,7 @@ TEST(FormatVariadicTest, DefaultReplacementValues) { EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ(' ', Replacements[0].Pad); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ("foo", Replacements[0].Options); } @@ -250,7 +250,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {0} EXPECT_EQ(ReplacementType::Format, Replacements[0].Type); EXPECT_EQ(0u, Replacements[0].Index); - EXPECT_EQ(0u, Replacements[0].Align); + EXPECT_EQ(0u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Right, Replacements[0].Where); EXPECT_EQ("", Replacements[0].Options); @@ -261,7 +261,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {1:foo} - Options=foo EXPECT_EQ(ReplacementType::Format, Replacements[2].Type); EXPECT_EQ(1u, Replacements[2].Index); - EXPECT_EQ(0u, Replacements[2].Align); + EXPECT_EQ(0u, Replacements[2].Width); EXPECT_EQ(AlignStyle::Right, Replacements[2].Where); EXPECT_EQ("foo", Replacements[2].Options); @@ -272,7 +272,7 @@ TEST(FormatVariadicTest, MultipleReplacements) { // {2:bar,-3} - Options=bar, Align=-3 EXPECT_EQ(ReplacementType::Format, Replacements[4].Type); EXPECT_EQ(2u, Replacements[4].Index); - EXPECT_EQ(3u, Replacements[4].Align); + EXPECT_EQ(3u, Replacements[4].Width); EXPECT_EQ(AlignStyle::Left, Replacements[4].Where); EXPECT_EQ("bar", Replacements[4].Options); } From be1958fd487dd58532a45b40be4a7152b80ec31a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 5 Sep 2024 17:02:48 +0100 Subject: [PATCH 250/425] [LLVM][CodeGen][SVE] Implement nxvbf16 fpextend to nxvf32/nxvf64. (#107253) NOTE: There are no dedicated SVE instructions but bf16->f32 is just a left shift because they share the same exponent range and from there other convert instructions can be used. --- .../Target/AArch64/AArch64ISelLowering.cpp | 23 ++++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +- .../test/CodeGen/AArch64/sve-bf16-converts.ll | 89 +++++++++++++++++++ 3 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-converts.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d1ddbfa300846..c0671dd1f0087 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1663,6 +1663,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -4298,8 +4299,28 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector()) + if (VT.isScalableVector()) { + SDValue SrcVal = Op.getOperand(0); + + if (SrcVal.getValueType().getScalarType() == MVT::bf16) { + // bf16 and f32 share the same exponent range so the conversion requires + // them to be aligned with the new mantissa bits zero'd. This is just a + // left shift that is best to isel directly. 
+ if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32) + return Op; + + if (VT != MVT::nxv2f64) + return SDValue(); + + // Break other conversions in two with the first part converting to f32 + // and the second using native f32->VT instructions. + SDLoc DL(Op); + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal)); + } + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); + } if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 4922fb280333b..692cd66d38437 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2320,7 +2320,12 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive:$Pg)), nxv2f32:$Zs, (i64 timm0_1), nxv2f16:$Zd)), (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Signed integer -> Floating-point + def : Pat<(nxv4f32 (fpextend nxv4bf16:$op)), + (LSL_ZZI_S $op, (i32 16))>; + def : Pat<(nxv2f32 (fpextend nxv2bf16:$op)), + (LSL_ZZI_S $op, (i32 16))>; + + // Signed integer -> Floating-point def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg nxv2i64:$Zs, nxv2i16), nxv2f16:$Zd)), (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll new file mode 100644 index 0000000000000..d72f92c1dac1f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define @fpext_nxv2bf16_to_nxv2f32( %a) { +; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv4bf16_to_nxv4f32( %a) { +; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv8bf16_to_nxv8f32( %a) { +; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: lsl z0.s, z1.s, #16 +; CHECK-NEXT: lsl z1.s, z2.s, #16 +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv2bf16_to_nxv2f64( %a) { +; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv4bf16_to_nxv4f64( %a) { +; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z2.s, z0.s, #16 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvt z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fpext_nxv8bf16_to_nxv8f64( %a) { +; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; 
CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z2.s, z2.s, #16 +; CHECK-NEXT: lsl z3.s, z3.s, #16 +; CHECK-NEXT: lsl z4.s, z0.s, #16 +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: fcvt z2.d, p0/m, z3.s +; CHECK-NEXT: movprfx z3, z4 +; CHECK-NEXT: fcvt z3.d, p0/m, z4.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} From 62e6c1ead7aedfbf973fb667537ff5cee4988da1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 5 Sep 2024 12:29:46 -0400 Subject: [PATCH 251/425] [lld/mac] Allow -segprot having stricter initprot than maxprot on mac (#107269) ...including for catalyst. The use case for this is to put certain security-critical variables into a special segment/section that's mapped as read-only most of the time, and that temporarily gets remapped as writeable when these variables are written to by the program. This protects against them being written to by heap spraying attacks. This special section should be mapped as read-only at program start, so using `-segprot MY_PROTECTED_MEMORY_THINGER rw r` to mark that segment as rw maxprot and r initprot is exactly what we want. lld has so far rejected mismatching initprot and maxprot. ld64 doesn't reject this, but silently writes initprot into both fields (!) It looks like this might not be fully intentional, see https://crbug.com/41495919#comment5 and http://crbug.com/41495919#comment8. In any case, when postprocessing ld64's output to have different values for initprot and maxprot, the dynamic loader seems to do the right thing (see also the previous two links). The same technique also works on Windows, using both link.exe and lld-link.exe with `/SECTION:myprotsect,R`. So, since this is useful, allow it when targeting macOS, and make it do what you'd expect. Since loader support for this on iOS is less clear, keep disallowing it there for now. See the PR for the program I used to check that this seems to work. (I only checked on arm64 macOS 14.5 so far; will run this on many more systems on bots once this is merged and rolled in.) --- lld/MachO/Driver.cpp | 18 +++++++++++++--- lld/MachO/OutputSegment.cpp | 6 ++++++ lld/test/MachO/segprot.s | 41 +++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 6a1ff96ed6569..09a539d71dab3 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1882,9 +1882,21 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, StringRef segName = arg->getValue(0); uint32_t maxProt = parseProtection(arg->getValue(1)); uint32_t initProt = parseProtection(arg->getValue(2)); - if (maxProt != initProt && config->arch() != AK_i386) - error("invalid argument '" + arg->getAsString(args) + - "': max and init must be the same for non-i386 archs"); + + // FIXME: Check if this works on more platforms. 
+ bool allowsDifferentInitAndMaxProt = + config->platform() == PLATFORM_MACOS || + config->platform() == PLATFORM_MACCATALYST; + if (allowsDifferentInitAndMaxProt) { + if (initProt > maxProt) + error("invalid argument '" + arg->getAsString(args) + + "': init must not be more permissive than max"); + } else { + if (maxProt != initProt && config->arch() != AK_i386) + error("invalid argument '" + arg->getAsString(args) + + "': max and init must be the same for non-macOS non-i386 archs"); + } + if (segName == segment_names::linkEdit) error("-segprot cannot be used to change __LINKEDIT's protections"); config->segmentProtections.push_back({segName, maxProt, initProt}); diff --git a/lld/MachO/OutputSegment.cpp b/lld/MachO/OutputSegment.cpp index 3d8a8eb61a9bb..c320af3fb3177 100644 --- a/lld/MachO/OutputSegment.cpp +++ b/lld/MachO/OutputSegment.cpp @@ -42,6 +42,12 @@ static uint32_t initProt(StringRef name) { static uint32_t maxProt(StringRef name) { assert(config->arch() != AK_i386 && "TODO: i386 has different maxProt requirements"); + auto it = find_if( + config->segmentProtections, + [&](const SegmentProtection &segprot) { return segprot.name == name; }); + if (it != config->segmentProtections.end()) + return it->maxProt; + return initProt(name); } diff --git a/lld/test/MachO/segprot.s b/lld/test/MachO/segprot.s index a4e91d1336105..a5ca8342b41aa 100644 --- a/lld/test/MachO/segprot.s +++ b/lld/test/MachO/segprot.s @@ -2,7 +2,10 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o ## Make sure the option parser doesn't think --x and -w are flags. -# RUN: %lld -dylib -o %t %t.o -segprot FOO rwx xwr -segprot BAR --x --x -segprot BAZ -w -w +# RUN: %lld -dylib -o %t %t.o \ +# RUN: -segprot FOO rwx xwr \ +# RUN: -segprot BAR --x --x \ +# RUN: -segprot BAZ -w -w # RUN: llvm-readobj --macho-segment %t | FileCheck %s # CHECK: Name: FOO @@ -32,12 +35,38 @@ # CHECK-NEXT: maxprot: -w- # CHECK-NEXT: initprot: -w- -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx rw 2>&1 | FileCheck %s --check-prefix=MISMATCH -# RUN: not %lld -dylib -o /dev/null %t.o -segprot __LINKEDIT rwx rwx 2>&1 | FileCheck %s --check-prefix=NO-LINKEDIT -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO uhh wat 2>&1 | FileCheck %s --check-prefix=MISPARSE -# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx 2>&1 | FileCheck %s --check-prefix=MISSING +# RUN: %lld -dylib -o %t.different %t.o -segprot FOO rw r +# RUN: llvm-readobj --macho-segment %t.different \ +# RUN: | FileCheck %s --check-prefix=DIFFERENT -# MISMATCH: error: invalid argument '-segprot FOO rwx rw': max and init must be the same for non-i386 archs +# RUN: %no-arg-lld -arch x86_64 -platform_version "mac catalyst" 14.0.0 17.5 \ +# RUN: -dylib -o /dev/null %t.o -segprot FOO rw r +# RUN: llvm-readobj --macho-segment %t.different \ +# RUN: | FileCheck %s --check-prefix=DIFFERENT + +# DIFFERENT: Name: FOO +# DIFFERENT-NEXT: Size: +# DIFFERENT-NEXT: vmaddr: +# DIFFERENT-NEXT: vmsize: +# DIFFERENT-NEXT: fileoff: +# DIFFERENT-NEXT: filesize: +# DIFFERENT-NEXT: maxprot: rw- +# DIFFERENT-NEXT: initprot: r-- + +# RUN: not %no-arg-lld -arch x86_64 -platform_version ios-simulator 14.0 15.0 \ +# RUN: -dylib -o /dev/null %t.o -segprot FOO rwx rw 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISMATCH +# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO r rw 2>&1 \ +# RUN: | FileCheck %s --check-prefix=INITTOOPERMISSIVE +# RUN: not %lld -dylib -o /dev/null %t.o -segprot __LINKEDIT rwx rwx 2>&1 \ +# RUN: | FileCheck %s --check-prefix=NO-LINKEDIT +# RUN: 
not %lld -dylib -o /dev/null %t.o -segprot FOO uhh wat 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISPARSE +# RUN: not %lld -dylib -o /dev/null %t.o -segprot FOO rwx 2>&1 \ +# RUN: | FileCheck %s --check-prefix=MISSING + +# MISMATCH: error: invalid argument '-segprot FOO rwx rw': max and init must be the same for non-macOS non-i386 archs +# INITTOOPERMISSIVE: error: invalid argument '-segprot FOO r rw': init must not be more permissive than max # NO-LINKEDIT: error: -segprot cannot be used to change __LINKEDIT's protections # MISPARSE: error: unknown -segprot letter 'u' in uhh # MISPARSE: error: unknown -segprot letter 'a' in wat From ce3648094d44e8c098396a353b215acecb363cda Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 6 Sep 2024 00:31:01 +0800 Subject: [PATCH 252/425] [RISCV] Update V0Defs after moving Src in peepholes (#107359) If we move a pseudo in tryReduceVL or foldVMV_V_V via ensureDominates, its V0 definition may have changed so we need to update V0Defs. This shouldn't have any functional change today since any pseudo which uses V0 won't be able to move past a new definition. However this will matter if we add a peephole to convert unmasked pseudos to masked pseudos and add a use of V0. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index a612a03106f02..db8e496493c41 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -61,7 +61,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { } private: - bool tryToReduceVL(MachineInstr &MI) const; + bool tryToReduceVL(MachineInstr &MI); bool convertToVLMAX(MachineInstr &MI) const; bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; @@ -72,7 +72,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool hasSameEEW(const MachineInstr &User, const MachineInstr &Src) const; bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; - bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; + bool ensureDominates(const MachineOperand &Use, MachineInstr &Src); /// Maps uses of V0 to the corresponding def of V0. DenseMap V0Defs; @@ -115,7 +115,7 @@ bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, // Attempt to reduce the VL of an instruction whose sole use is feeding a // instruction with a narrower VL. This currently works backwards from the // user instruction (which might have a smaller VL). -bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { +bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) { // Note that the goal here is a bit multifaceted. // 1) For store's reducing the VL of the value being stored may help to // reduce VL toggles. This is somewhat of an artifact of the fact we @@ -465,17 +465,18 @@ static bool dominates(MachineBasicBlock::const_iterator A, /// does. Returns false if doesn't dominate and we can't move. \p MO must be in /// the same basic block as \Src. 
bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, - MachineInstr &Src) const { + MachineInstr &Src) { assert(MO.getParent()->getParent() == Src.getParent()); if (!MO.isReg() || MO.getReg() == RISCV::NoRegister) return true; MachineInstr *Def = MRI->getVRegDef(MO.getReg()); if (Def->getParent() == Src.getParent() && !dominates(Def, Src)) { - if (!isSafeToMove(Src, *Def->getNextNode())) + MachineInstr *AfterDef = Def->getNextNode(); + if (!isSafeToMove(Src, *AfterDef)) return false; - // FIXME: Update V0Defs - Src.moveBefore(Def->getNextNode()); + V0Defs[&Src] = V0Defs[AfterDef]; + Src.moveBefore(AfterDef); } return true; From 8e28f0471b20ed1148951bc7ffe5c503c43692ae Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 5 Sep 2024 09:38:05 -0700 Subject: [PATCH 253/425] [libc] Correct the entrypoints list for ARM/darwin (#107331) These entrypoints were added to every target without testing. They don't work on ARM macs. --- libc/config/darwin/arm/entrypoints.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 36da9e1313663..a012504daa5c5 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -94,16 +94,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.calloc libc.src.stdlib.realloc libc.src.stdlib.free - - # stdio.h external entrypoints - libc.src.stdio.snprintf - libc.src.stdio.sprintf - libc.src.stdio.asprintf - libc.src.stdio.asprintf - libc.src.stdio.vprintf - libc.src.stdio.vsnprintf - libc.src.stdio.vsprintf - libc.src.stdio.vasprintf ) set(TARGET_LIBM_ENTRYPOINTS @@ -148,9 +138,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cospif libc.src.math.dfmal libc.src.math.dsqrtl - libc.src.math.daddl libc.src.math.ddivl - libc.src.math.dsubl libc.src.math.erff libc.src.math.exp libc.src.math.expf From 2ed510dc9789ca0b9172f0593527bee9d53496c4 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 5 Sep 2024 09:38:45 -0700 Subject: [PATCH 254/425] [LLDB][Minidump] Extend the minidump x86_64 registers to include fs_base and gs_base (#106767) A follow up to #106473 Minidump wasn't collecting fs or gs_base. This patch extends the x86_64 register context and gated reading it behind an lldb specific flag. Additionally these registers are explicitly checked in the tests. 
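For illustration, the new registers can be read back from a saved minidump
through the C++ SB API roughly as follows. This is only a sketch: the binary
name and core path are placeholders, error checking is omitted, and the
in-tree Python test updated below remains the authoritative check.

    #include "lldb/API/SBDebugger.h"
    #include "lldb/API/SBFrame.h"
    #include "lldb/API/SBProcess.h"
    #include "lldb/API/SBTarget.h"
    #include "lldb/API/SBThread.h"
    #include "lldb/API/SBValue.h"
    #include <cstdio>

    int main() {
      lldb::SBDebugger::Initialize();
      lldb::SBDebugger dbg = lldb::SBDebugger::Create();
      // Placeholder paths: any x86_64 binary plus a minidump saved from it
      // with this patch applied (e.g. via "process save-core").
      lldb::SBTarget target = dbg.CreateTarget("a.out");
      lldb::SBProcess process = target.LoadCore("core.dmp");
      lldb::SBFrame frame = process.GetThreadAtIndex(0).GetFrameAtIndex(0);
      // fs_base/gs_base are only filled in when the LLDB-specific context
      // flag introduced here was set by the producer of the minidump.
      lldb::SBValue fs_base = frame.FindRegister("fs_base");
      lldb::SBValue gs_base = frame.FindRegister("gs_base");
      std::printf("fs_base = 0x%llx, gs_base = 0x%llx\n",
                  (unsigned long long)fs_base.GetValueAsUnsigned(),
                  (unsigned long long)gs_base.GetValueAsUnsigned());
      lldb::SBDebugger::Terminate();
      return 0;
    }
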
--- .../ObjectFile/Minidump/MinidumpFileBuilder.cpp | 5 ++++- .../minidump/RegisterContextMinidump_x86_64.cpp | 8 ++++++++ .../minidump/RegisterContextMinidump_x86_64.h | 8 +++++++- .../TestProcessSaveCoreMinidump.py | 12 ++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 13355afb58dbd..5c9ba223ad143 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -473,7 +473,8 @@ GetThreadContext_x86_64(RegisterContext *reg_ctx) { lldb_private::minidump::MinidumpContext_x86_64_Flags::x86_64_Flag | lldb_private::minidump::MinidumpContext_x86_64_Flags::Control | lldb_private::minidump::MinidumpContext_x86_64_Flags::Segments | - lldb_private::minidump::MinidumpContext_x86_64_Flags::Integer); + lldb_private::minidump::MinidumpContext_x86_64_Flags::Integer | + lldb_private::minidump::MinidumpContext_x86_64_Flags::LLDBSpecific); thread_context.rax = read_register_u64(reg_ctx, "rax"); thread_context.rbx = read_register_u64(reg_ctx, "rbx"); thread_context.rcx = read_register_u64(reg_ctx, "rcx"); @@ -499,6 +500,8 @@ GetThreadContext_x86_64(RegisterContext *reg_ctx) { thread_context.gs = read_register_u64(reg_ctx, "gs"); thread_context.ss = read_register_u64(reg_ctx, "ss"); thread_context.ds = read_register_u64(reg_ctx, "ds"); + thread_context.fs_base = read_register_u64(reg_ctx, "fs_base"); + thread_context.gs_base = read_register_u64(reg_ctx, "gs_base"); return thread_context; } diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp index 917140cab2976..e879c49315659 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp @@ -67,6 +67,7 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( auto ControlFlag = MinidumpContext_x86_64_Flags::Control; auto IntegerFlag = MinidumpContext_x86_64_Flags::Integer; auto SegmentsFlag = MinidumpContext_x86_64_Flags::Segments; + auto LLDBSpecificFlag = MinidumpContext_x86_64_Flags::LLDBSpecific; if ((context_flags & x86_64_Flag) != x86_64_Flag) return nullptr; @@ -104,6 +105,13 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( writeRegister(&context->r15, result_base, reg_info[lldb_r15_x86_64]); } + if ((context_flags & LLDBSpecificFlag) == LLDBSpecificFlag) { + writeRegister(&context->fs_base, result_base, + reg_info[x86_64_with_base::lldb_fs_base]); + writeRegister(&context->gs_base, result_base, + reg_info[x86_64_with_base::lldb_gs_base]); + } + // TODO parse the floating point registers return result_context_buf; diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h index d920ea9d823f4..f8d38e4ba5378 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.h @@ -153,6 +153,11 @@ struct MinidumpContext_x86_64 { llvm::support::ulittle64_t last_branch_from_rip; llvm::support::ulittle64_t last_exception_to_rip; llvm::support::ulittle64_t last_exception_from_rip; + + // LLDB can save core files and save extra information that isn't available + // from Google breakpad, 
or similar, minidump files. + llvm::support::ulittle64_t fs_base; + llvm::support::ulittle64_t gs_base; }; // For context_flags. These values indicate the type of @@ -168,9 +173,10 @@ enum class MinidumpContext_x86_64_Flags : uint32_t { FloatingPoint = x86_64_Flag | 0x00000008, DebugRegisters = x86_64_Flag | 0x00000010, XState = x86_64_Flag | 0x00000040, + LLDBSpecific = x86_64_Flag | 0x80000000, Full = Control | Integer | FloatingPoint, - All = Full | Segments | DebugRegisters, + All = Full | Segments | DebugRegisters | LLDBSpecific, LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ All) }; diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index ea59aef004aff..ed15793b527fc 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -67,6 +67,18 @@ def verify_core_file( self.assertIn(thread_id, stacks_to_registers_map) register_val_list = stacks_to_registers_map[thread_id] frame_register_list = frame.GetRegisters() + # explicitly verify we collected fs and gs base for x86_64 + explicit_registers = ["fs_base", "gs_base"] + for reg in explicit_registers: + register = frame_register_list.GetFirstValueByName(reg) + self.assertNotEqual(None, register) + self.assertEqual( + register.GetValueAsUnsigned(), + stacks_to_registers_map[thread_id] + .GetFirstValueByName("fs_base") + .GetValueAsUnsigned(), + ) + for x in register_val_list: self.assertEqual( x.GetValueAsUnsigned(), From 953af0e7f1bcb42136be1a0ea9cdd5aa1fb74852 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 12:39:05 -0400 Subject: [PATCH 255/425] [libc++][NFC] Increase consistency for namespace closing comments --- libcxx/include/__mdspan/extents.h | 2 +- libcxx/src/filesystem/error.h | 2 +- libcxx/src/filesystem/file_descriptor.h | 2 +- libcxx/src/filesystem/format_string.h | 2 +- libcxx/src/filesystem/operations.cpp | 6 +++--- libcxx/src/filesystem/posix_compat.h | 2 +- libcxx/src/filesystem/time_utils.h | 2 +- libcxx/src/include/atomic_support.h | 2 +- libcxx/src/memory_resource.cpp | 2 +- libcxx/src/system_error.cpp | 2 +- libcxx/test/benchmarks/ContainerBenchmarks.h | 2 +- libcxx/test/benchmarks/VariantBenchmarks.h | 2 +- .../fs.op.last_write_time/last_write_time.pass.cpp | 2 +- .../meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp | 2 +- .../tuple/tuple.tuple/tuple.apply/apply.pass.cpp | 2 +- libcxx/test/support/archetypes.h | 8 ++++---- libcxx/test/support/container_test_types.h | 4 ++-- libcxx/test/support/filesystem_test_helper.h | 2 +- libcxx/test/support/make_test_thread.h | 2 +- libcxx/test/support/parse_integer.h | 2 +- libcxx/test/support/uses_alloc_types.h | 2 +- libcxxabi/src/cxa_guard_impl.h | 4 ++-- libcxxabi/src/cxa_personality.cpp | 2 +- 23 files changed, 30 insertions(+), 30 deletions(-) diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index 8fcfb4b09a2b1..3d2c2771a834b 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -449,7 +449,7 @@ struct __make_dextents< _IndexType, 0, extents<_IndexType, _ExtentsPack...>> { using type = extents<_IndexType, _ExtentsPack...>; }; -} // end namespace __mdspan_detail +} // namespace __mdspan_detail // [mdspan.extents.dextents], alias template template diff --git a/libcxx/src/filesystem/error.h 
b/libcxx/src/filesystem/error.h index 572cc73292a19..09020fbede9b9 100644 --- a/libcxx/src/filesystem/error.h +++ b/libcxx/src/filesystem/error.h @@ -225,7 +225,7 @@ struct ErrorHandler { ErrorHandler& operator=(ErrorHandler const&) = delete; }; -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/file_descriptor.h b/libcxx/src/filesystem/file_descriptor.h index 50178ff84e03f..f86eb60d77808 100644 --- a/libcxx/src/filesystem/file_descriptor.h +++ b/libcxx/src/filesystem/file_descriptor.h @@ -284,7 +284,7 @@ inline file_status FileDescriptor::refresh_status(error_code& ec) { return m_status; } -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/format_string.h b/libcxx/src/filesystem/format_string.h index a44def86f53e9..81c5a95ae31e5 100644 --- a/libcxx/src/filesystem/format_string.h +++ b/libcxx/src/filesystem/format_string.h @@ -70,7 +70,7 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) string format_string(const cha return ret; } -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index a83c1ae15a4a4..d771f20097352 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -254,7 +254,7 @@ bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_cod #endif // copy_file_impl implementation } // end anonymous namespace -} // end namespace detail +} // namespace detail bool __copy_file(const path& from, const path& to, copy_options options, error_code* ec) { using detail::FileDescriptor; @@ -732,7 +732,7 @@ uintmax_t remove_all_impl(path const& p, error_code& ec) { return count; } -} // end namespace +} // namespace uintmax_t __remove_all(const path& p, error_code* ec) { ErrorHandler err("remove_all", ec, &p); @@ -827,7 +827,7 @@ uintmax_t remove_all_impl(int parent_directory, const path& p, error_code& ec) { return 0; } -} // end namespace +} // namespace uintmax_t __remove_all(const path& p, error_code* ec) { ErrorHandler err("remove_all", ec, &p); diff --git a/libcxx/src/filesystem/posix_compat.h b/libcxx/src/filesystem/posix_compat.h index 760cdb65dae1d..b41c004341af3 100644 --- a/libcxx/src/filesystem/posix_compat.h +++ b/libcxx/src/filesystem/posix_compat.h @@ -490,7 +490,7 @@ using SSizeT = ::ssize_t; #endif -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/time_utils.h b/libcxx/src/filesystem/time_utils.h index e05f252868f03..13f0d6f3b726e 100644 --- a/libcxx/src/filesystem/time_utils.h +++ b/libcxx/src/filesystem/time_utils.h @@ -344,7 +344,7 @@ inline file_time_type __extract_last_write_time(const path& p, const StatT& st, #endif // !_LIBCPP_HAS_NO_FILESYSTEM -} // end namespace detail +} // namespace detail _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/include/atomic_support.h b/libcxx/src/include/atomic_support.h index 9ce41b3229f5e..c4bc34ffc1cd2 100644 --- a/libcxx/src/include/atomic_support.h +++ b/libcxx/src/include/atomic_support.h @@ -125,7 +125,7 @@ __libcpp_atomic_compare_exchange(_ValueType* __val, _ValueType* __expected, _Val #endif // _LIBCPP_HAS_NO_THREADS -} // end namespace +} // namespace _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index 2117238e63487..d2ff3509c5a31 100644 --- a/libcxx/src/memory_resource.cpp +++ 
b/libcxx/src/memory_resource.cpp @@ -82,7 +82,7 @@ union ResourceInitHelper { // attribute with a value that's reserved for the implementation (we're the implementation). #include "memory_resource_init_helper.h" -} // end namespace +} // namespace memory_resource* new_delete_resource() noexcept { return &res_init.resources.new_delete_res; } diff --git a/libcxx/src/system_error.cpp b/libcxx/src/system_error.cpp index f518b480a2782..3367bd56bd788 100644 --- a/libcxx/src/system_error.cpp +++ b/libcxx/src/system_error.cpp @@ -110,7 +110,7 @@ string make_error_str(const error_code& ec) { } return string(); } -} // end namespace +} // namespace string __do_message::message(int ev) const { #if defined(_LIBCPP_HAS_NO_THREADS) diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h index 744505b439985..5404814e7599b 100644 --- a/libcxx/test/benchmarks/ContainerBenchmarks.h +++ b/libcxx/test/benchmarks/ContainerBenchmarks.h @@ -223,6 +223,6 @@ static void BM_Compare_different_containers(benchmark::State& st, Container, Gen } } -} // end namespace ContainerBenchmarks +} // namespace ContainerBenchmarks #endif // BENCHMARK_CONTAINER_BENCHMARKS_H diff --git a/libcxx/test/benchmarks/VariantBenchmarks.h b/libcxx/test/benchmarks/VariantBenchmarks.h index a8e9c9febd728..ad36b59d2e879 100644 --- a/libcxx/test/benchmarks/VariantBenchmarks.h +++ b/libcxx/test/benchmarks/VariantBenchmarks.h @@ -50,6 +50,6 @@ static void BM_Visit(benchmark::State& state) { } } -} // end namespace VariantBenchmarks +} // namespace VariantBenchmarks #endif // BENCHMARK_VARIANT_BENCHMARKS_H diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp index c1816a94fd2b6..7024e50e81209 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp @@ -300,7 +300,7 @@ static const bool SupportsMinRoundTrip = [] { return min_val == file_time_type::min(); }(); -} // end namespace +} // namespace static bool CompareTime(TimeSpec t1, TimeSpec t2) { if (SupportsNanosecondRoundTrip) diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp index 925ddad28136b..cb7614224342f 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_swappable.pass.cpp @@ -61,7 +61,7 @@ struct AmbiguousSwap {}; template void swap(T&, T&) {} -} // end namespace MyNS2 +} // namespace MyNS2 int main(int, char**) { diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp index af37527522075..ebcbcfeffc022 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/apply.pass.cpp @@ -247,7 +247,7 @@ namespace ReturnTypeTest { using InvokeResult = decltype(std::apply(fn, t)); static_assert(std::is_same::value, ""); } -} // end namespace ReturnTypeTest +} // namespace ReturnTypeTest void test_return_type() { diff --git a/libcxx/test/support/archetypes.h b/libcxx/test/support/archetypes.h index 
53545f687970e..259346e90a510 100644 --- a/libcxx/test/support/archetypes.h +++ b/libcxx/test/support/archetypes.h @@ -328,7 +328,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ConstexprTestTypes +} // namespace ConstexprTestTypes //============================================================================// @@ -351,7 +351,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ExplicitConstexprTestTypes +} // namespace ExplicitConstexprTestTypes //============================================================================// @@ -373,7 +373,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace TrivialTestTypes +} // namespace TrivialTestTypes //============================================================================// // @@ -395,7 +395,7 @@ constexpr bool operator!=(Tp const& L, Tp const& R) noexcept { return L.value != R.value; } -} // end namespace ExplicitTrivialTestTypes +} // namespace ExplicitTrivialTestTypes #endif // TEST_STD_VER >= 11 diff --git a/libcxx/test/support/container_test_types.h b/libcxx/test/support/container_test_types.h index b72bbbeaccf77..7a97e04398054 100644 --- a/libcxx/test/support/container_test_types.h +++ b/libcxx/test/support/container_test_types.h @@ -349,7 +349,7 @@ typedef std::allocator_traits A2T; static_assert(std::is_same, A2T>::value, ""); static_assert(std::is_same, A1T>::value, ""); -} // end namespace test_detail +} // namespace test_detail //===----------------------------------------------------------------------===// // 'CopyInsertable', 'MoveInsertable' and 'EmplaceConstructible' test types @@ -491,6 +491,6 @@ template > using multiset = std::multiset, ContainerTestAllocator >; -} // end namespace TCT +} // namespace TCT #endif // SUPPORT_CONTAINER_TEST_TYPES_H diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h index 29bc846fbbc83..932d14f45f8e5 100644 --- a/libcxx/test/support/filesystem_test_helper.h +++ b/libcxx/test/support/filesystem_test_helper.h @@ -146,7 +146,7 @@ namespace utils { struct ::stat tmp; return ::stat(path.c_str(), &tmp) == 0; } -} // end namespace utils +} // namespace utils struct scoped_test_env { diff --git a/libcxx/test/support/make_test_thread.h b/libcxx/test/support/make_test_thread.h index cd548fd909d71..00190a8a69ce1 100644 --- a/libcxx/test/support/make_test_thread.h +++ b/libcxx/test/support/make_test_thread.h @@ -46,6 +46,6 @@ TEST_AVAILABILITY_SYNC std::jthread make_test_jthread(F&& f, Args&&... 
args) { } #endif -} // end namespace support +} // namespace support #endif // TEST_SUPPORT_MAKE_TEST_THREAD_H diff --git a/libcxx/test/support/parse_integer.h b/libcxx/test/support/parse_integer.h index c4fa429e65d57..36f8f7b79881b 100644 --- a/libcxx/test/support/parse_integer.h +++ b/libcxx/test/support/parse_integer.h @@ -62,7 +62,7 @@ struct parse_integer_impl { return std::stoull(str); } }; -} // end namespace detail +} // namespace detail template T parse_integer(std::basic_string const& str) { diff --git a/libcxx/test/support/uses_alloc_types.h b/libcxx/test/support/uses_alloc_types.h index 817bef1da3aef..66746960fc87c 100644 --- a/libcxx/test/support/uses_alloc_types.h +++ b/libcxx/test/support/uses_alloc_types.h @@ -120,7 +120,7 @@ using IdentityT = typename Identity::type; template using EnableIfB = typename std::enable_if::type; -} // end namespace detail +} // namespace detail using detail::EnableIfB; diff --git a/libcxxabi/src/cxa_guard_impl.h b/libcxxabi/src/cxa_guard_impl.h index 320501cb85938..3e533054098e2 100644 --- a/libcxxabi/src/cxa_guard_impl.h +++ b/libcxxabi/src/cxa_guard_impl.h @@ -676,8 +676,8 @@ static_assert(CurrentImplementation != Implementation::Futex || PlatformSupports using SelectedImplementation = SelectImplementation::type; -} // end namespace -} // end namespace __cxxabiv1 +} // namespace +} // namespace __cxxabiv1 #if defined(__clang__) # pragma clang diagnostic pop diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp index 843a18a4cbd8a..5f6e75c5be19c 100644 --- a/libcxxabi/src/cxa_personality.cpp +++ b/libcxxabi/src/cxa_personality.cpp @@ -167,7 +167,7 @@ uintptr_t readPointerHelper(const uint8_t*& p) { return static_cast(value); } -} // end namespace +} // namespace extern "C" { From eee2f02e4e28e54e5a38a1dbbd62ea6780909e16 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 5 Sep 2024 12:42:55 -0400 Subject: [PATCH 256/425] [libc++][modules] Get rid of dependency in __datasizeof (#107394) All the compilers we support also provide __builtin_offsetof, so avoid using this macro and use the builtin directly instead. This allows removing a dependency on ``, which is heavier than we need. --- libcxx/include/__type_traits/datasizeof.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h index b4cbd1ddfa8de..0c1ed94f84029 100644 --- a/libcxx/include/__type_traits/datasizeof.h +++ b/libcxx/include/__type_traits/datasizeof.h @@ -10,9 +10,9 @@ #define _LIBCPP___TYPE_TRAITS_DATASIZEOF_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_class.h> #include <__type_traits/is_final.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -37,15 +37,15 @@ struct _FirstPaddingByte { char __first_padding_byte_; }; -// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow -// the use as an extension. +// _FirstPaddingByte<> is sometimes non-standard layout. +// It is conditionally-supported to use __builtin_offsetof in that case, but GCC and Clang allow it. 
_LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof") _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Winvalid-offsetof") template -inline const size_t __datasizeof_v = offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_); +inline const size_t __datasizeof_v = __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_); _LIBCPP_DIAGNOSTIC_POP -#endif // __has_extension(datasizeof) +#endif // __has_extension(datasizeof) _LIBCPP_END_NAMESPACE_STD From 5edede2db09d38cbf9397edb9bfd43b92265f660 Mon Sep 17 00:00:00 2001 From: Tim Gymnich Date: Thu, 5 Sep 2024 18:46:14 +0200 Subject: [PATCH 257/425] [DXIL] Add sign intrinsic part 2 (#101988) makes progress on #70078 ### Changes - Added `int_dx_sign` intrinsic in `IntrinsicsDirectX.td` - Added expansion for `int_dx_sign in `DXILIntrinsicExpansion.cpp` - Added DXIL backend test case ### Related PRs - https://github.com/llvm/llvm-project/pull/101987 - https://github.com/llvm/llvm-project/pull/101989 --- llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 31 +++ llvm/test/CodeGen/DirectX/sign.ll | 212 ++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 llvm/test/CodeGen/DirectX/sign.ll diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d2c0859f52a24..f089d51fa1b45 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -81,4 +81,5 @@ def int_dx_rcp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; +def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty]>; } diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 2daa4f825c3b2..72fa9891bfd8e 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" @@ -48,6 +49,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_fdot: case Intrinsic::dx_sdot: case Intrinsic::dx_udot: + case Intrinsic::dx_sign: return true; } return false; @@ -359,6 +361,32 @@ static Value *expandClampIntrinsic(CallInst *Orig, {MaxCall, Max}, nullptr, "dx.min"); } +static Value *expandSignIntrinsic(CallInst *Orig) { + Value *X = Orig->getOperand(0); + Type *Ty = X->getType(); + Type *ScalarTy = Ty->getScalarType(); + Type *RetTy = Orig->getType(); + Constant *Zero = Constant::getNullValue(Ty); + + IRBuilder<> Builder(Orig); + + Value *GT; + Value *LT; + if (ScalarTy->isFloatingPointTy()) { + GT = Builder.CreateFCmpOLT(Zero, X); + LT = Builder.CreateFCmpOLT(X, Zero); + } else { + assert(ScalarTy->isIntegerTy()); + GT = Builder.CreateICmpSLT(Zero, X); + LT = Builder.CreateICmpSLT(X, Zero); + } + + Value *ZextGT = Builder.CreateZExt(GT, RetTy); + Value *ZextLT = Builder.CreateZExt(LT, RetTy); + + return Builder.CreateSub(ZextGT, ZextLT); +} + static bool expandIntrinsic(Function &F, CallInst *Orig) { Value *Result = nullptr; Intrinsic::ID IntrinsicId = F.getIntrinsicID(); @@ -402,6 +430,9 @@ static bool expandIntrinsic(Function &F, 
CallInst *Orig) { case Intrinsic::dx_udot: Result = expandIntegerDotIntrinsic(Orig, IntrinsicId); break; + case Intrinsic::dx_sign: + Result = expandSignIntrinsic(Orig); + break; } if (Result) { diff --git a/llvm/test/CodeGen/DirectX/sign.ll b/llvm/test/CodeGen/DirectX/sign.ll new file mode 100644 index 0000000000000..0a1631bd5ddc1 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/sign.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + + +define noundef i32 @sign_half(half noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_half( +; CHECK-SAME: half noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt half 0xH0000, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt half [[A]], 0xH0000 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f16(half %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_float(float noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_float( +; CHECK-SAME: float noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float 0.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt float [[A]], 0.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f32(float %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_double(double noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_double( +; CHECK-SAME: double noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double 0.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A]], 0.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.f64(double %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i16(i16 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i16( +; CHECK-SAME: i16 noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i16 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i16 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i16(i16 %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i32(i32 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i32( +; CHECK-SAME: i32 noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i32(i32 %a) + ret i32 %elt.sign +} + +define noundef i32 @sign_i64(i64 noundef %a) { +; CHECK-LABEL: define noundef i32 @sign_i64( +; CHECK-SAME: i64 
noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 0, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[A]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %elt.sign = call i32 @llvm.dx.sign.i64(i64 %a) + ret i32 %elt.sign +} + +define noundef <4 x i32> @sign_half_vector(<4 x half> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_half_vector( +; CHECK-SAME: <4 x half> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f16(<4 x half> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_float_vector(<4 x float> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_float_vector( +; CHECK-SAME: <4 x float> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x float> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f32(<4 x float> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_double_vector(<4 x double> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_double_vector( +; CHECK-SAME: <4 x double> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x double> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x double> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4f64(<4 x double> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i16_vector(<4 x i16> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i16_vector( +; CHECK-SAME: <4 x i16> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i16> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i16> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i16(<4 x i16> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i32_vector(<4 x i32> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i32_vector( +; CHECK-SAME: <4 x i32> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[A]], zeroinitializer 
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i32(<4 x i32> %a) + ret <4 x i32> %elt.sign +} + +define noundef <4 x i32> @sign_i64_vector(<4 x i64> noundef %a) { +; CHECK-LABEL: define noundef <4 x i32> @sign_i64_vector( +; CHECK-SAME: <4 x i64> noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i64> zeroinitializer, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i64> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] +; +entry: + %elt.sign = call <4 x i32> @llvm.dx.sign.v4i64(<4 x i64> %a) + ret <4 x i32> %elt.sign +} + + +declare i32 @llvm.dx.sign.f16(half) +declare i32 @llvm.dx.sign.f32(float) +declare i32 @llvm.dx.sign.f64(double) + +declare i32 @llvm.dx.sign.i16(i16) +declare i32 @llvm.dx.sign.i32(i32) +declare i32 @llvm.dx.sign.i64(i64) + +declare <4 x i32> @llvm.dx.sign.v4f16(<4 x half>) +declare <4 x i32> @llvm.dx.sign.v4f32(<4 x float>) +declare <4 x i32> @llvm.dx.sign.v4f64(<4 x double>) + +declare <4 x i32> @llvm.dx.sign.v4i16(<4 x i16>) +declare <4 x i32> @llvm.dx.sign.v4i32(<4 x i32>) +declare <4 x i32> @llvm.dx.sign.v4i64(<4 x i64>) From 0818c2801ecc5cb07b680bb77e24df90f35c74b9 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 5 Sep 2024 09:47:26 -0700 Subject: [PATCH 258/425] [WebAssembly] Simplify a switch-case in CFGStackify (NFC) (#107360) This merges some `case`s using `[[fallthrough]]`, and make `DELEGATE` as a separate `case`. (Previously the reason we didn't do that was not to duplicate the code in `RewriteOperands`. But now that we've extracted it into a lambda function in #107182 we can do it. --- .../WebAssembly/WebAssemblyCFGStackify.cpp | 20 +++++++++---------- .../WebAssembly/WebAssemblyInstrControl.td | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 3362ea5316e45..3cccc57e629fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1681,18 +1681,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { Stack.pop_back(); break; - case WebAssembly::END_BLOCK: - Stack.push_back(std::make_pair(&MBB, &MI)); - break; - case WebAssembly::END_TRY: { - // We handle DELEGATE in the default level, because DELEGATE has - // immediate operands to rewrite. 
- Stack.push_back(std::make_pair(&MBB, &MI)); auto *EHPad = TryToEHPad[EndToBegin[&MI]]; EHPadStack.push_back(EHPad); - break; + [[fallthrough]]; } + case WebAssembly::END_BLOCK: + Stack.push_back(std::make_pair(&MBB, &MI)); + break; case WebAssembly::END_LOOP: Stack.push_back(std::make_pair(EndToBegin[&MI]->getParent(), &MI)); @@ -1707,12 +1703,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { MI.getOperand(0).setImm(getRethrowDepth(Stack, EHPadStack)); break; + case WebAssembly::DELEGATE: + RewriteOperands(MI); + Stack.push_back(std::make_pair(&MBB, &MI)); + break; + default: if (MI.isTerminator()) RewriteOperands(MI); - - if (MI.getOpcode() == WebAssembly::DELEGATE) - Stack.push_back(std::make_pair(&MBB, &MI)); break; } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index dd40015577fd7..05880b89d1fbc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -147,9 +147,9 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, // usage gets low enough. // Rethrowing an exception: rethrow -let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>; -} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 +// isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // The depth argument will be computed in CFGStackify. We set it to 0 here for // now. def : Pat<(int_wasm_rethrow), (RETHROW 0)>; From 18ad98e7947502da0c8f6dcbbf485bb34fe8d204 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Thu, 5 Sep 2024 09:53:49 -0700 Subject: [PATCH 259/425] [lldb] Fix a format string in ClangASTSource (#107325) Without this, LLDB asserts when enabling the expression logs. --- lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index 1fdd272dcbece..e41efdd3f61c7 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -293,7 +293,7 @@ void ClangASTSource::CompleteType(clang::ObjCInterfaceDecl *interface_decl) { LLDB_LOG(log, " [CompleteObjCInterfaceDecl] on (ASTContext*){0:x} '{1}' " - "Completing an ObjCInterfaceDecl named {1}", + "Completing an ObjCInterfaceDecl named {2}", m_ast_context, m_clang_ast_context->getDisplayName(), interface_decl->getName()); LLDB_LOG(log, " [COID] Before:\n{0}", From 91a3c6f3d66b866bcda8a0f7d4815bc8f2dbd86c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 5 Sep 2024 17:54:57 +0100 Subject: [PATCH 260/425] [AArch64] Remove redundant COPY from loadRegFromStackSlot (#107396) This removes a redundant 'COPY' instruction that #81716 probably forgot to remove. This redundant COPY led to an issue because because code in LiveRangeSplitting expects that the instruction emitted by `loadRegFromStackSlot` is an instruction that accesses memory, which isn't the case for the COPY instruction. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 --- llvm/test/CodeGen/AArch64/spillfill-sve.mir | 37 +++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 70f69f34a4a7d..3b38a5f78dee5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5559,10 +5559,6 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (PNRReg.isValid() && !PNRReg.isVirtual()) MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); - - if (PNRReg.isValid() && PNRReg.isVirtual()) - BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg) - .addReg(DestReg); } bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 11cf388e38531..83c9b73c57570 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -11,6 +11,7 @@ define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2mul2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_pnr() #1 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable } @@ -216,7 +217,7 @@ body: | ; EXPAND: STR_PXI killed renamable $pn8, $sp, 7 ; ; EXPAND: renamable $pn8 = LDR_PXI $sp, 7 - ; EXPAND: $p0 = PEXT_PCI_B killed renamable $pn8, 0 + ; EXPAND-NEXT: $p0 = PEXT_PCI_B killed renamable $pn8, 0 %0:pnr_p8to15 = WHILEGE_CXX_B undef $x0, undef $x0, 0, implicit-def dead $nzcv @@ -242,6 +243,40 @@ body: | RET_ReallyLR ... --- +name: spills_fills_stack_id_virtreg_ppr_to_pnr +tracksRegLiveness: true +registers: + - { id: 0, class: ppr } + - { id: 1, class: pnr_p8to15 } +stack: +body: | + bb.0.entry: + liveins: $p0 + + %0:ppr = COPY $p0 + + $pn0 = IMPLICIT_DEF + $pn1 = IMPLICIT_DEF + $pn2 = IMPLICIT_DEF + $pn3 = IMPLICIT_DEF + $pn4 = IMPLICIT_DEF + $pn5 = IMPLICIT_DEF + $pn6 = IMPLICIT_DEF + $pn7 = IMPLICIT_DEF + $pn8 = IMPLICIT_DEF + $pn9 = IMPLICIT_DEF + $pn10 = IMPLICIT_DEF + $pn11 = IMPLICIT_DEF + $pn12 = IMPLICIT_DEF + $pn13 = IMPLICIT_DEF + $pn14 = IMPLICIT_DEF + $pn15 = IMPLICIT_DEF + + %1:pnr_p8to15 = COPY %0 + $p0 = PEXT_PCI_B %1, 0 + RET_ReallyLR +... +--- name: spills_fills_stack_id_zpr tracksRegLiveness: true registers: From 54194e1506bdd6dc37988678a8047ad4d48168fa Mon Sep 17 00:00:00 2001 From: Michal Terepeta Date: Thu, 5 Sep 2024 19:01:29 +0200 Subject: [PATCH 261/425] [RISCV][SiFive7] Change `Latency` of VCIX to the default (#106497) Currently we multiply the default (`SiFive7GetCyclesDefault`) by 10, but this turns out to be both surprising to our users and leads to worse codegen in most cases. I think it's more natural to just keep the default. In the end the right solution is probably to have a separate scheduling model for a particular VCIX coprocessor. 
--- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3f2e8dee76fd6..24cbe1531c017 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -961,15 +961,18 @@ def : InstRW<[WriteIALU], (instrs COPY)>; // VCIX // -// In principle we don't know the latency of any VCIX instructions. But instead -// of taking the default of 1, which can lead to issues [1], we assume that they -// have a fairly high latency. +// In principle we don't know the latency of any VCIX instructions (they +// depends on a particular coprocessor implementation). However, the default +// latency of 1 can lead to issues [1]. So instead we set the latency to the +// default provided by `SiFive7GetCyclesDefault`. This is still not accurate +// and can lead to suboptimal codegen, but should hopefully be a better +// starting point. // // [1] https://github.com/llvm/llvm-project/issues/83391 foreach mx = SchedMxList in { defvar Cycles = SiFive7GetCyclesDefault.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; - let Latency = !mul(Cycles, 10), + let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; From cf2ecc7c1c24dee6e3b70a836474a5ac553829b3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 5 Sep 2024 18:20:16 +0100 Subject: [PATCH 262/425] [LV] Remove over-aggressive assert from 3fe6a064f15c. There are some cases where only the first operand is marked for truncation. In that case, the compare won't be truncated which would incorrectly trigger the assertion. It also shows that the check pre 3fe6a064f15c also considered compares truncated that cannot be truncated. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../truncate-to-minimal-bitwidth-cost.ll | 89 +++++++++++++++++++ 2 files changed, 91 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0ccf442dac999..780da36189c36 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6607,12 +6607,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); - (void)Op0AsInstruction; - assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || - canTruncateToMinimalBitwidth(I, VF)) && - "truncating Op0 must imply truncating the compare"); if (canTruncateToMinimalBitwidth(I, VF)) { + Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); + (void)Op0AsInstruction; assert(!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || MinBWs[I] == MinBWs[Op0AsInstruction] && "if both the operand and the compare are marked for " diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 9fe5a2a6a3ecc..a2c3f2bfa274c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -269,6 +269,93 @@ exit: ret i8 %trunc } +define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 %v, ptr noalias %src) #1 { +; CHECK-LABEL: define void @icmp_only_first_op_truncated( +; CHECK-SAME: ptr noalias [[DST:%.*]], i32 [[X:%.*]], i64 [[N:%.*]], i64 [[V:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[T:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[V]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[T]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[BROADCAST_SPLAT]] to +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq [[TMP7]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: 
[[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[BROADCAST_SPLAT4]], i32 8, [[TMP8]], poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[T1:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[T1]], [[T]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IDXPROM]] +; CHECK-NEXT: [[RETVAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: store double [[RETVAL]], ptr [[DST]], align 8 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %t = trunc i64 %N to i32 + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %t1 = trunc i64 %N to i32 + %c = icmp eq i32 %t1, %t + br i1 %c, label %then, label %loop.latch + +then: + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr double, ptr %src, i64 %idxprom + %retval = load double, ptr %arrayidx, align 8 + store double %retval, ptr %dst, align 8 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %v + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-features"="+64bit,+v" } @@ -283,4 +370,6 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ;. From 311ac6381649fa0f7cc495db8fa697d6a9b43988 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Thu, 5 Sep 2024 13:25:06 -0400 Subject: [PATCH 263/425] [NFC][SystemZ][z/OS] Rename autoconversion-related functions to be less generic (#107399) This patch renames the functions in AutoConvert.h/cpp to have a less generic name because they are z/OS specific. 
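For reference, a minimal usage sketch with the renamed C++ entry points
(assumes a z/OS host; on other hosts the guarded block is compiled out, and
the error handling here is illustrative only):

    #include "llvm/Support/AutoConvert.h"
    #include <cstdio>
    #include <system_error>
    #include <unistd.h>

    int main() {
    #ifdef __MVS__
      // Enable z/OS enhanced ASCII auto-conversion on stdout, then restore
      // the original mode before exiting.
      if (std::error_code EC = llvm::enablezOSAutoConversion(STDOUT_FILENO))
        std::fprintf(stderr, "enabling auto-conversion failed: %s\n",
                     EC.message().c_str());
      std::puts("converted output");
      llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO);
    #endif
      return 0;
    }
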
--- clang/tools/c-arcmt-test/c-arcmt-test.c | 4 ++-- clang/tools/c-index-test/c-index-test.c | 4 ++-- llvm/include/llvm/Support/AutoConvert.h | 14 +++++++------- llvm/lib/Support/AutoConvert.cpp | 20 ++++++++++---------- llvm/lib/Support/InitLLVM.cpp | 10 +++++----- llvm/lib/Support/MemoryBuffer.cpp | 2 +- llvm/lib/Support/Unix/Path.inc | 8 ++++---- llvm/lib/Support/Unix/Program.inc | 2 +- llvm/lib/Support/raw_ostream.cpp | 4 ++-- llvm/utils/count/count.c | 4 ++-- 10 files changed, 36 insertions(+), 36 deletions(-) diff --git a/clang/tools/c-arcmt-test/c-arcmt-test.c b/clang/tools/c-arcmt-test/c-arcmt-test.c index 00999f188c7dc..4d0c418714b95 100644 --- a/clang/tools/c-arcmt-test/c-arcmt-test.c +++ b/clang/tools/c-arcmt-test/c-arcmt-test.c @@ -109,10 +109,10 @@ static void flush_atexit(void) { int main(int argc, const char **argv) { #ifdef __MVS__ - if (enableAutoConversion(fileno(stdout)) == -1) + if (enablezOSAutoConversion(fileno(stdout)) == -1) fprintf(stderr, "Setting conversion on stdout failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stderr, "Setting conversion on stderr failed\n"); #endif diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index f472a67f3bc5b..b48f44950ab75 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -5180,10 +5180,10 @@ int main(int argc, const char **argv) { thread_info client_data; #ifdef __MVS__ - if (enableAutoConversion(fileno(stdout)) == -1) + if (enablezOSAutoConversion(fileno(stdout)) == -1) fprintf(stderr, "Setting conversion on stdout failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stderr, "Setting conversion on stderr failed\n"); #endif diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index f4fe80b22a876..6f45c4683f777 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -27,9 +27,9 @@ #ifdef __cplusplus extern "C" { #endif // __cplusplus -int enableAutoConversion(int FD); -int disableAutoConversion(int FD); -int restoreStdHandleAutoConversion(int FD); +int enablezOSAutoConversion(int FD); +int disablezOSAutoConversion(int FD); +int restorezOSStdHandleAutoConversion(int FD); #ifdef __cplusplus } #endif // __cplusplus @@ -39,18 +39,18 @@ namespace llvm { /// \brief Disable the z/OS enhanced ASCII auto-conversion for the file /// descriptor. -std::error_code disableAutoConversion(int FD); +std::error_code disablezOSAutoConversion(int FD); /// \brief Query the z/OS enhanced ASCII auto-conversion status of a file /// descriptor and force the conversion if the file is not tagged with a /// codepage. -std::error_code enableAutoConversion(int FD); +std::error_code enablezOSAutoConversion(int FD); /// Restore the z/OS enhanced ASCII auto-conversion for the std handle. -std::error_code restoreStdHandleAutoConversion(int FD); +std::error_code restorezOSStdHandleAutoConversion(int FD); /// \brief Set the tag information for a file descriptor. 
-std::error_code setFileTag(int FD, int CCSID, bool Text); +std::error_code setzOSFileTag(int FD, int CCSID, bool Text); } // namespace llvm #endif // __cplusplus diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index c509284ee916e..66570735f8fc8 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -22,7 +22,7 @@ static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; -int disableAutoConversion(int FD) { +int disablezOSAutoConversion(int FD) { static const struct f_cnvrt Convert = { SETCVTOFF, // cvtcmd 0, // pccsid @@ -32,7 +32,7 @@ int disableAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Convert); } -int restoreStdHandleAutoConversion(int FD) { +int restorezOSStdHandleAutoConversion(int FD) { assert(FD == STDIN_FILENO || FD == STDOUT_FILENO || FD == STDERR_FILENO); if (savedStdHandleAutoConversionMode[FD] == -1) return 0; @@ -44,7 +44,7 @@ int restoreStdHandleAutoConversion(int FD) { return (fcntl(FD, F_CONTROL_CVT, &Cvt)); } -int enableAutoConversion(int FD) { +int enablezOSAutoConversion(int FD) { struct f_cnvrt Query = { QUERYCVT, // cvtcmd 0, // pccsid @@ -81,28 +81,28 @@ int enableAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Query); } -std::error_code llvm::disableAutoConversion(int FD) { - if (::disableAutoConversion(FD) == -1) +std::error_code llvm::disablezOSAutoConversion(int FD) { + if (::disablezOSAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::enableAutoConversion(int FD) { - if (::enableAutoConversion(FD) == -1) +std::error_code llvm::enablezOSAutoConversion(int FD) { + if (::enablezOSAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::restoreStdHandleAutoConversion(int FD) { - if (::restoreStdHandleAutoConversion(FD) == -1) +std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) { + if (::restorezOSStdHandleAutoConversion(FD) == -1) return errnoAsErrorCode(); return std::error_code(); } -std::error_code llvm::setFileTag(int FD, int CCSID, bool Text) { +std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) && "FT_UNTAGGED and FT_BINARY are not allowed for text files"); struct file_tag Tag; diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index b7e463a19122d..2b3ddd39b0879 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -27,9 +27,9 @@ void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); Errs->flush(); - llvm::restoreStdHandleAutoConversion(STDIN_FILENO); - llvm::restoreStdHandleAutoConversion(STDOUT_FILENO); - llvm::restoreStdHandleAutoConversion(STDERR_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO); + llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO); } #endif @@ -70,8 +70,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, // If turning on conversion for stderr fails then the error message // may be garbled. There is no solution to this problem. 
- ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO))); - ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO))); + ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO))); + ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO))); #endif #ifdef _WIN32 diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index fb7e804fd7e84..aea81964ba9fd 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -505,7 +505,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, #ifdef __MVS__ // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enableAutoConversion(FD)) + if (auto EC = llvm::enablezOSAutoConversion(FD)) return EC; #endif diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index cf05db546e021..8098392a7fd90 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -1096,17 +1096,17 @@ std::error_code openFile(const Twine &Name, int &ResultFD, !Stat.st_tag.ft_txtflag && !Stat.st_tag.ft_ccsid && Stat.st_size == 0; if (Flags & OF_Text) { - if (auto EC = llvm::enableAutoConversion(ResultFD)) + if (auto EC = llvm::enablezOSAutoConversion(ResultFD)) return EC; if (DoSetTag) { - if (auto EC = llvm::setFileTag(ResultFD, CCSID_IBM_1047, true)) + if (auto EC = llvm::setzOSFileTag(ResultFD, CCSID_IBM_1047, true)) return EC; } } else { - if (auto EC = llvm::disableAutoConversion(ResultFD)) + if (auto EC = llvm::disablezOSAutoConversion(ResultFD)) return EC; if (DoSetTag) { - if (auto EC = llvm::setFileTag(ResultFD, FT_BINARY, false)) + if (auto EC = llvm::setzOSFileTag(ResultFD, FT_BINARY, false)) return EC; } } diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc index 2742734bb11ed..ec0fad7076b45 100644 --- a/llvm/lib/Support/Unix/Program.inc +++ b/llvm/lib/Support/Unix/Program.inc @@ -533,7 +533,7 @@ std::error_code llvm::sys::ChangeStdoutMode(fs::OpenFlags Flags) { std::error_code llvm::sys::ChangeStdinToBinary() { #ifdef __MVS__ - return disableAutoConversion(STDIN_FILENO); + return disablezOSAutoConversion(STDIN_FILENO); #else // Do nothing, as Unix doesn't differentiate between text and binary. return std::error_code(); diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 2ce54faa9857e..48f04091e9e39 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -894,7 +894,7 @@ raw_fd_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. std::error_code EC; #ifdef __MVS__ - EC = enableAutoConversion(STDOUT_FILENO); + EC = enablezOSAutoConversion(STDOUT_FILENO); assert(!EC); #endif static raw_fd_ostream S("-", EC, sys::fs::OF_None); @@ -905,7 +905,7 @@ raw_fd_ostream &llvm::outs() { raw_fd_ostream &llvm::errs() { // Set standard error to be unbuffered. 
#ifdef __MVS__ - std::error_code EC = enableAutoConversion(STDERR_FILENO); + std::error_code EC = enablezOSAutoConversion(STDERR_FILENO); assert(!EC); #endif static raw_fd_ostream S(STDERR_FILENO, false, true); diff --git a/llvm/utils/count/count.c b/llvm/utils/count/count.c index 300be2aa8a18e..9166145fcc10a 100644 --- a/llvm/utils/count/count.c +++ b/llvm/utils/count/count.c @@ -12,10 +12,10 @@ int main(int argc, char **argv) { #ifdef __MVS__ - if (enableAutoConversion(fileno(stdin)) == -1) + if (enablezOSAutoConversion(fileno(stdin)) == -1) fprintf(stderr, "Setting conversion on stdin failed\n"); - if (enableAutoConversion(fileno(stderr)) == -1) + if (enablezOSAutoConversion(fileno(stderr)) == -1) fprintf(stdout, "Setting conversion on stderr failed\n"); #endif size_t Count, NumLines, NumRead; From 5e25291b3c50873dbd0e2b3939b113bcff691460 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 10:35:02 -0700 Subject: [PATCH 264/425] [SandboxIR][Bench] Initial patch for performance tracking (#107296) This patch adds a new benchmark suite for SandboxIR. It measures the performance of some of the most commonly used API functions and compares it against LLVM IR. --- llvm/benchmarks/CMakeLists.txt | 4 + llvm/benchmarks/SandboxIRBench.cpp | 115 +++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 llvm/benchmarks/SandboxIRBench.cpp diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index aa0cb77773344..1078efa55f497 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -1,5 +1,7 @@ set(LLVM_LINK_COMPONENTS + AsmParser Core + SandboxIR Support) add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED) @@ -7,3 +9,5 @@ add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED) + diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp new file mode 100644 index 0000000000000..633de6db1f5e2 --- /dev/null +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -0,0 +1,115 @@ +//===- SandboxIRBench.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These tests measure the performance of some core SandboxIR functions and +// compare them against LLVM IR. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/SandboxIR/SandboxIR.h"
+#include "llvm/Support/SourceMgr.h"
+#include <sstream>
+
+using namespace llvm;
+
+static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyString(IR, Err, C);
+  if (!M)
+    Err.print("SandboxIRBench", errs());
+  return M;
+}
+
+enum class IR {
+  LLVM,
+  SBox,
+};
+// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from IR::LLVM/IR::SBox.
+template <IR IRTy> struct TypeSelect {};
+template <> struct TypeSelect<IR::LLVM> {
+  using BasicBlock = llvm::BasicBlock;
+};
+template <> struct TypeSelect<IR::SBox> {
+  using BasicBlock = sandboxir::BasicBlock;
+};
+
+template <IR IRTy>
+static typename TypeSelect<IRTy>::BasicBlock *
+genIR(std::unique_ptr<llvm::Module> &LLVMM, LLVMContext &LLVMCtx,
+      sandboxir::Context &Ctx,
+      std::function<std::string(unsigned)> GenerateIRStr,
+      unsigned NumInstrs = 0u) {
+  std::string IRStr = GenerateIRStr(NumInstrs);
+  LLVMM = parseIR(LLVMCtx, IRStr.c_str());
+  llvm::Function *LLVMF = &*LLVMM->getFunction("foo");
+  llvm::BasicBlock *LLVMBB = &*LLVMF->begin();
+
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  sandboxir::BasicBlock *BB = &*F->begin();
+  if constexpr (IRTy == IR::LLVM)
+    return LLVMBB;
+  else
+    return BB;
+}
+
+static std::string generateBBWalkIR(unsigned Size) {
+  std::stringstream SS;
+  SS << "define void @foo(i32 %v1, i32 %v2) {\n";
+  for (auto Cnt : seq<unsigned>(0, Size))
+    SS << " %add" << Cnt << " = add i32 %v1, %v2\n";
+  SS << "ret void";
+  SS << "}";
+  return SS.str();
+}
+
+template <IR IRTy> static void BBWalk(benchmark::State &State) {
+  LLVMContext LLVMCtx;
+  sandboxir::Context Ctx(LLVMCtx);
+  unsigned NumInstrs = State.range(0);
+  std::unique_ptr<llvm::Module> LLVMM;
+  auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateBBWalkIR, NumInstrs);
+  for (auto _ : State) {
+    // Walk LLVM Instructions.
+    for (auto &I : *BB)
+      benchmark::DoNotOptimize(I);
+  }
+}
+
+static std::string generateGetTypeIR(unsigned Size) {
+  return R"IR(
+define void @foo(i32 %v1, i32 %v2) {
+  %add = add i32 %v1, %v2
+  ret void
+}
+)IR";
+}
+
+template <IR IRTy> static void GetType(benchmark::State &State) {
+  LLVMContext LLVMCtx;
+  sandboxir::Context Ctx(LLVMCtx);
+  std::unique_ptr<llvm::Module> LLVMM;
+  auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateGetTypeIR);
+  auto *I = &*BB->begin();
+  for (auto _ : State)
+    benchmark::DoNotOptimize(I->getType());
+}
+
+BENCHMARK(GetType<IR::LLVM>);
+BENCHMARK(GetType<IR::SBox>);
+
+BENCHMARK(BBWalk<IR::LLVM>)->Args({1024});
+BENCHMARK(BBWalk<IR::SBox>)->Args({1024});
+
+BENCHMARK_MAIN();
From 3815f478bb4f1c724d36044a4e0bbd3352313322 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Thu, 5 Sep 2024 19:40:58 +0200
Subject: [PATCH 265/425] [mlir][Transforms] Dialect conversion: Make materializations optional (#107109)

This commit makes source/target/argument materializations (via the
`TypeConverter` API) optional.

By default (`ConversionConfig::buildMaterializations = true`), the dialect
conversion infrastructure tries to legalize all unresolved materializations
right after the main transformation process has succeeded. If at least one
unresolved materialization fails to resolve, the dialect conversion fails.
(With an error message such as `failed to legalize unresolved materialization
...`.) Automatic materializations through the `TypeConverter` API can now be
deactivated.
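For illustration, a conversion driver that wants to opt out might look roughly
like the following sketch (not part of this patch; it assumes the existing
`applyPartialConversion` overload that accepts a `ConversionConfig`):

  #include "mlir/Transforms/DialectConversion.h"

  using namespace mlir;

  // Run a partial conversion but keep unresolved materializations as
  // builtin.unrealized_conversion_cast ops instead of failing the conversion.
  static LogicalResult
  convertWithoutMaterializations(Operation *op, const ConversionTarget &target,
                                 const FrozenRewritePatternSet &patterns) {
    ConversionConfig config;
    config.buildMaterializations = false;
    return applyPartialConversion(op, target, patterns, config);
  }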
In that case, every unresolved materialization will show up as a
`builtin.unrealized_conversion_cast` op in the output IR.

There used to be a complex and error-prone analysis in the dialect conversion
that predicted the future uses of unresolved materializations. Based on that
logic, some casts (that were deemed unnecessary) were folded. This analysis
was needed because folding happened at a point in time when some IR changes
(e.g., op replacements) had not materialized yet.

This commit removes that analysis. Any folding of cast ops now happens after
all other IR changes have been materialized and the uses can directly be
queried from the IR. This simplifies the analysis significantly. Certain
helper data structures such as `inverseMapping` are also no longer needed for
the analysis.

The folding itself is done by `reconcileUnrealizedCasts` (which also exists as
a standalone pass). After casts have been folded, the remaining casts are
materialized through the `TypeConverter`, as usual. This last step can be
deactivated in the `ConversionConfig`.

`ConversionConfig::buildMaterializations = false` can be used to debug error
messages such as `failed to legalize unresolved materialization ...`. (It is
also useful in case automatic materializations are not needed.) The
materializations that failed to resolve can then be seen as
`builtin.unrealized_conversion_cast` ops in the resulting IR. (This is better
than running with `-debug`, because `-debug` shows IR where some IR changes
have not been materialized yet.)

Note: This is a reupload of #104668, but with correct handling of cyclic
unrealized_conversion_casts that may be generated by the dialect conversion.
---
 .../mlir/Transforms/DialectConversion.h       |  11 +
 .../Transforms/Utils/DialectConversion.cpp    | 466 +++++++-----------
 .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir |   5 +-
 .../Transforms/finalizing-bufferize.mlir      |   1 +
 .../test-legalize-type-conversion.mlir        |   6 +-
 mlir/test/Transforms/test-legalizer.mlir      |  11 +
 mlir/test/lib/Dialect/Test/TestPatterns.cpp   |  34 +-
 7 files changed, 234 insertions(+), 300 deletions(-)

diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 60113bdef16a2..5f680e8eca755 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -1124,6 +1124,17 @@ struct ConversionConfig {
   // already been modified) and iterators into past IR state cannot be
   // represented at the moment.
   RewriterBase::Listener *listener = nullptr;
+
+  /// If set to "true", the dialect conversion attempts to build source/target/
+  /// argument materializations through the type converter API in lieu of
+  /// builtin.unrealized_conversion_cast ops. The conversion process fails if
+  /// at least one materialization could not be built.
+  ///
+  /// If set to "false", the dialect conversion does not build any
+  /// custom materializations and instead inserts
+  /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR
+  /// is valid.
+ bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index b23fb97959ed6..450e66f0db4e7 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,14 +702,12 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } + void rollback() override; + UnrealizedConversionCastOp getOperation() const { return cast(op); } - void rollback() override; - - void cleanup(RewriterBase &rewriter) override; - /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -766,7 +764,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), config(config) {} + : context(ctx), eraseRewriter(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -834,6 +832,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// + /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -882,7 +881,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). void eraseOp(Operation *op) override { - if (erased.contains(op)) + if (wasErased(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -890,17 +889,24 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (erased.contains(block)) + if (wasErased(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } + bool wasErased(void *ptr) const { return erased.contains(ptr); } + + bool wasErased(OperationRewrite *rewrite) const { + return wasErased(rewrite->getOperation()); + } + void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } + private: /// Pointers to all erased operations and blocks. DenseSet erased; }; @@ -912,6 +918,11 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; + /// A rewriter that keeps track of ops/block that were already erased and + /// skips duplicate op/block erasures. This rewriter is used during the + /// "cleanup" phase. + SingleEraseRewriter eraseRewriter; + // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1058,10 +1069,6 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } -void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { - rewriter.eraseOp(op); -} - void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. 
IRRewriter rewriter(context, config.listener); @@ -1069,7 +1076,6 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. - SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2353,12 +2359,6 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); - /// Legalize any unresolved type materializations. - LogicalResult legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping); - /// Legalize an operation result that was marked as "erased". LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2405,6 +2405,128 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } +static LogicalResult +legalizeUnresolvedMaterialization(RewriterBase &rewriter, + UnresolvedMaterializationRewrite *rewrite) { + UnrealizedConversionCastOp op = rewrite->getOperation(); + assert(!op.use_empty() && + "expected that dead materializations have already been DCE'd"); + Operation::operand_range inputOperands = op.getOperands(); + Type outputType = op.getResultTypes()[0]; + + // Try to materialize the conversion. + if (const TypeConverter *converter = rewrite->getConverter()) { + rewriter.setInsertionPoint(op); + Value newMaterialization; + switch (rewrite->getMaterializationKind()) { + case MaterializationKind::Argument: + // Try to materialize an argument conversion. + newMaterialization = converter->materializeArgumentConversion( + rewriter, op->getLoc(), outputType, inputOperands); + if (newMaterialization) + break; + // If an argument materialization failed, fallback to trying a target + // materialization. + [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + rewriter.replaceOp(op, newMaterialization); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + diag.attachNote(op->getUsers().begin()->getLoc()) + << "see existing live user here: " << *op->getUsers().begin(); + return failure(); +} + +/// Erase all dead unrealized_conversion_cast ops. An op is dead if its results +/// are not used (transitively) by any op that is not in the given list of +/// cast ops. +/// +/// In particular, this function erases cyclic casts that may be inserted +/// during the dialect conversion process. E.g.: +/// %0 = unrealized_conversion_cast(%1) +/// %1 = unrealized_conversion_cast(%0) +// Note: This step will become unnecessary when +// https://github.com/llvm/llvm-project/pull/106760 has been merged. +static void eraseDeadUnrealizedCasts( + ArrayRef castOps, + SmallVectorImpl *remainingCastOps) { + // Ops that have already been visited or are currently being visited. + DenseSet visited; + // Set of all cast ops for faster lookups. 
+ DenseSet castOpSet; + // Set of all cast ops that have been determined to be alive. + DenseSet live; + + for (UnrealizedConversionCastOp op : castOps) + castOpSet.insert(op); + + // Visit a cast operation. Return "true" if the operation is live. + std::function visit = [&](Operation *op) -> bool { + // No need to traverse any IR if the op was already marked as live. + if (live.contains(op)) + return true; + + // Do not visit ops multiple times. If we find a circle, no live user was + // found on the current path. + if (visited.contains(op)) + return false; + visited.insert(op); + + // Visit all users. + for (Operation *user : op->getUsers()) { + // If the user is not an unrealized_conversion_cast op, then the given op + // is live. + if (!castOpSet.contains(user)) { + live.insert(op); + return true; + } + // Otherwise, it is live if a live op can be reached from one of its + // users (which must all be unrealized_conversion_cast ops). + if (visit(user)) { + live.insert(op); + return true; + } + } + + return false; + }; + + // Visit all cast ops. + for (UnrealizedConversionCastOp op : castOps) { + visit(op); + visited.clear(); + } + + // Erase all cast ops that are dead. + for (UnrealizedConversionCastOp op : castOps) { + if (live.contains(op)) { + if (remainingCastOps) + remainingCastOps->push_back(op); + continue; + } + op->dropAllUses(); + op->erase(); + } +} + LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2446,6 +2568,38 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } + + // Gather all unresolved materializations. + SmallVector allCastOps; + DenseMap rewriteMap; + for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + if (rewriterImpl.eraseRewriter.wasErased(mat)) + continue; + allCastOps.push_back(mat->getOperation()); + rewriteMap[mat->getOperation()] = mat; + } + + // Reconcile all UnrealizedConversionCastOps that were inserted by the + // dialect conversion frameworks. (Not the one that were inserted by + // patterns.) + SmallVector remainingCastOps1, remainingCastOps2; + eraseDeadUnrealizedCasts(allCastOps, &remainingCastOps1); + reconcileUnrealizedCasts(remainingCastOps1, &remainingCastOps2); + + // Try to legalize all unresolved materializations. + if (config.buildMaterializations) { + IRRewriter rewriter(rewriterImpl.context, config.listener); + for (UnrealizedConversionCastOp castOp : remainingCastOps2) { + auto it = rewriteMap.find(castOp.getOperation()); + assert(it != rewriteMap.end() && "inconsistent state"); + if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) + return failure(); + } + } + return success(); } @@ -2459,9 +2613,6 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); - if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, - inverseMapping))) - return failure(); return success(); } @@ -2577,279 +2728,6 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } -/// Replace the results of a materialization operation with the given values. 
-static void -replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, - ResultRange matResults, ValueRange values, - DenseMap> &inverseMapping) { - matResults.replaceAllUsesWith(values); - - // For each of the materialization results, update the inverse mappings to - // point to the replacement values. - for (auto [matResult, newValue] : llvm::zip(matResults, values)) { - auto inverseMapIt = inverseMapping.find(matResult); - if (inverseMapIt == inverseMapping.end()) - continue; - - // Update the reverse mapping, or remove the mapping if we couldn't update - // it. Not being able to update signals that the mapping would have become - // circular (i.e. %foo -> newValue -> %foo), which may occur as values are - // propagated through temporary materializations. We simply drop the - // mapping, and let the post-conversion replacement logic handle updating - // uses. - for (Value inverseMapVal : inverseMapIt->second) - if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) - rewriterImpl.mapping.erase(inverseMapVal); - } -} - -/// Compute all of the unresolved materializations that will persist beyond the -/// conversion process, and require inserting a proper user materialization for. -static void computeNecessaryMaterializations( - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping, - SetVector &necessaryMaterializations) { - // Helper function to check if the given value or a not yet materialized - // replacement of the given value is live. - // Note: `inverseMapping` maps from replaced values to original values. - auto isLive = [&](Value value) { - auto findFn = [&](Operation *user) { - auto matIt = materializationOps.find(user); - if (matIt != materializationOps.end()) - return !necessaryMaterializations.count(matIt->second); - return rewriterImpl.isOpIgnored(user); - }; - // A worklist is needed because a value may have gone through a chain of - // replacements and each of the replaced values may have live users. - SmallVector worklist; - worklist.push_back(value); - while (!worklist.empty()) { - Value next = worklist.pop_back_val(); - if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) - return true; - // This value may be replacing another value that has a live user. - llvm::append_range(worklist, inverseMapping.lookup(next)); - } - return false; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value invalidRoot, Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type && remappedValue != invalidRoot) - return remappedValue; - - // Check to see if the input is a materialization operation that - // provides an inverse conversion. We just check blindly for - // UnrealizedConversionCastOp here, but it has no effect on correctness. 
- auto inputCastOp = value.getDefiningOp(); - if (inputCastOp && inputCastOp->getNumOperands() == 1) - return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), - type); - - return Value(); - }; - - SetVector worklist; - for (auto &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - materializationOps.try_emplace(mat->getOperation(), mat); - worklist.insert(mat); - } - while (!worklist.empty()) { - UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); - UnrealizedConversionCastOp op = mat->getOperation(); - - // We currently only handle target materializations here. - assert(op->getNumResults() == 1 && "unexpected materialization type"); - OpResult opResult = op->getOpResult(0); - Type outputType = opResult.getType(); - Operation::operand_range inputOperands = op.getOperands(); - - // Try to forward propagate operands for user conversion casts that result - // in the input types of the current cast. - for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { - auto castOp = dyn_cast(user); - if (!castOp) - continue; - if (castOp->getResultTypes() == inputOperands.getTypes()) { - replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, - inverseMapping); - necessaryMaterializations.remove(materializationOps.lookup(user)); - } - } - - // Try to avoid materializing a resolved materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = - lookupRemappedValue(opResult, inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - necessaryMaterializations.remove(mat); - continue; - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // If the materialization does not have any live users, we don't need to - // generate a user materialization for it. - bool isMaterializationLive = isLive(opResult); - if (!isMaterializationLive) - continue; - if (!necessaryMaterializations.insert(mat)) - continue; - - // Reprocess input materializations to see if they have an updated status. - for (Value input : inputOperands) { - if (auto parentOp = input.getDefiningOp()) { - if (auto *mat = materializationOps.lookup(parentOp)) - worklist.insert(mat); - } - } - } -} - -/// Legalize the given unresolved materialization. Returns success if the -/// materialization was legalized, failure otherise. -static LogicalResult legalizeUnresolvedMaterialization( - UnresolvedMaterializationRewrite &mat, - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - auto findLiveUser = [&](auto &&users) { - auto liveUserIt = llvm::find_if_not( - users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); }); - return liveUserIt == users.end() ? nullptr : *liveUserIt; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. 
- Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type) - return remappedValue; - return Value(); - }; - - UnrealizedConversionCastOp op = mat.getOperation(); - if (!rewriterImpl.ignoredOps.insert(op)) - return success(); - - // We currently only handle target materializations here. - OpResult opResult = op->getOpResult(0); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = opResult.getType(); - - // If any input to this materialization is another materialization, resolve - // the input first. - for (Value value : op->getOperands()) { - auto valueCast = value.getDefiningOp(); - if (!valueCast) - continue; - - auto matIt = materializationOps.find(valueCast); - if (matIt != materializationOps.end()) - if (failed(legalizeUnresolvedMaterialization( - *matIt->second, materializationOps, rewriter, rewriterImpl, - inverseMapping))) - return failure(); - } - - // Perform a last ditch attempt to avoid materializing a resolved - // materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = lookupRemappedValue(inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - return success(); - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // Try to materialize the conversion. - if (const TypeConverter *converter = mat.getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (mat.getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. - [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - replaceMaterialization(rewriterImpl, opResult, newMaterialization, - inverseMapping); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - if (Operation *liveUser = findLiveUser(op->getUsers())) { - diag.attachNote(liveUser->getLoc()) - << "see existing live user here: " << *liveUser; - } - return failure(); -} - -LogicalResult OperationConverter::legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - // As an initial step, compute all of the inserted materializations that we - // expect to persist beyond the conversion process. 
- DenseMap materializationOps; - SetVector necessaryMaterializations; - computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, - inverseMapping, necessaryMaterializations); - - // Once computed, legalize any necessary materializations. - for (auto *mat : necessaryMaterializations) { - if (failed(legalizeUnresolvedMaterialization( - *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) - return failure(); - } - return success(); -} - LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 156a8a468d5b4..75362378daaaa 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,7 +1286,6 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 -// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1299,8 +1298,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> 
!llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index a192434c5accf..ab18ce05e355d 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,6 +80,7 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> + // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index cf2c9f6a8ec44..f130adff42f8c 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,6 +4,7 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { + // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -22,6 +23,7 @@ func.func @test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -30,6 +32,7 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -49,6 +52,7 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -99,9 +103,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}} - // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () + // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 
a789ab9a82e19..e5503ee892042 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -452,3 +452,14 @@ func.func @convert_detached_signature() { }) : () -> () "test.return"() : () -> () } + +// ----- + +// CHECK-LABEL: func @circular_mapping() +// CHECK-NEXT: "test.valid"() : () -> () +func.func @circular_mapping() { + // Regression test that used to crash due to circular + // unrealized_conversion_cast ops. + %0 = "test.erase_op"() : () -> (i64) + "test.drop_operands_and_replace_with_valid"(%0) : (i64) -> () +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 91dfb2faa80a1..3cbc307835afd 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -907,6 +907,22 @@ struct TestPassthroughInvalidOp : public ConversionPattern { return success(); } }; +/// Replace with valid op, but simply drop the operands. This is used in a +/// regression where we used to generate circular unrealized_conversion_cast +/// ops. +struct TestDropAndReplaceInvalidOp : public ConversionPattern { + TestDropAndReplaceInvalidOp(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, + "test.drop_operands_and_replace_with_valid", 1, ctx) { + } + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + rewriter.replaceOpWithNewOp(op, std::nullopt, ValueRange(), + std::nullopt); + return success(); + } +}; /// This pattern handles the case of a split return value. struct TestSplitReturnType : public ConversionPattern { TestSplitReturnType(MLIRContext *ctx) @@ -1070,6 +1086,19 @@ struct TestCreateUnregisteredOp : public OpRewritePattern { return success(); }; }; + +class TestEraseOp : public ConversionPattern { +public: + TestEraseOp(MLIRContext *ctx) : ConversionPattern("test.erase_op", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + // Erase op without replacements. + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace namespace { @@ -1148,8 +1177,9 @@ struct TestLegalizePatternDriver TestUpdateConsumerType, TestNonRootReplacement, TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite, TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore, - TestUndoPropertiesModification>(&getContext()); - patterns.add(&getContext(), converter); + TestUndoPropertiesModification, TestEraseOp>(&getContext()); + patterns.add( + &getContext(), converter); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); mlir::populateCallOpTypeConversionPattern(patterns, converter); From ebc7f5578033248ce7de52a7f374332a2fc201aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Thu, 5 Sep 2024 19:45:19 +0200 Subject: [PATCH 266/425] [llvm][SystemZ] Fix parsing of `.cfi_undefined` with percent-less registers. (#107032) This is just e3d658b applied to SystemZ. 
An example of this being used in the wild: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/s390/s390-64/start.S;h=59eeb7e998227bdf32029cd074f0876c450404ea;hb=HEAD#l63 --- .../SystemZ/AsmParser/SystemZAsmParser.cpp | 56 ++++---- llvm/test/MC/SystemZ/regs-bad.s | 3 - llvm/test/MC/SystemZ/regs-good.s | 128 ++++++++++++++++++ 3 files changed, 161 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index f2c04215d12dd..7c3898ac67312 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -416,7 +416,8 @@ class SystemZAsmParser : public MCTargetAsmParser { return static_cast(TS); } - bool parseRegister(Register &Reg, bool RestoreOnFailure = false); + bool parseRegister(Register &Reg, bool RequirePercent, + bool RestoreOnFailure = false); bool parseIntegerRegister(Register &Reg, RegisterGroup Group); @@ -495,7 +496,7 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseDirective(AsmToken DirectiveID) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, - bool RestoreOnFailure); + bool RequirePercent, bool RestoreOnFailure); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -756,26 +757,32 @@ void SystemZOperand::print(raw_ostream &OS) const { } // Parse one register of the form %. -bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { - Reg.StartLoc = Parser.getTok().getLoc(); - - // Eat the % prefix. - if (Parser.getTok().isNot(AsmToken::Percent)) - return Error(Parser.getTok().getLoc(), "register expected"); +bool SystemZAsmParser::parseRegister(Register &Reg, bool RequirePercent, + bool RestoreOnFailure) { const AsmToken &PercentTok = Parser.getTok(); - Parser.Lex(); + bool HasPercent = PercentTok.is(AsmToken::Percent); + + Reg.StartLoc = PercentTok.getLoc(); + + if (RequirePercent && PercentTok.isNot(AsmToken::Percent)) + return Error(PercentTok.getLoc(), "register expected"); + + if (HasPercent) { + Parser.Lex(); // Eat percent token. + } // Expect a register name. if (Parser.getTok().isNot(AsmToken::Identifier)) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); - return Error(Reg.StartLoc, "invalid register"); + return Error(Reg.StartLoc, + HasPercent ? "invalid register" : "register expected"); } // Check that there's a prefix. StringRef Name = Parser.getTok().getString(); if (Name.size() < 2) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -783,7 +790,7 @@ bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { // Treat the rest of the register name as a register number. 
if (Name.substr(1).getAsInteger(10, Reg.Num)) { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -800,7 +807,7 @@ bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) { else if (Prefix == 'c' && Reg.Num < 16) Reg.Group = RegCR; else { - if (RestoreOnFailure) + if (RestoreOnFailure && HasPercent) getLexer().UnLex(PercentTok); return Error(Reg.StartLoc, "invalid register"); } @@ -842,7 +849,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, // Handle register names of the form % if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return ParseStatus::Failure; // Check the parsed register group "Reg.Group" with the expected "Group" @@ -918,7 +925,7 @@ ParseStatus SystemZAsmParser::parseAnyRegister(OperandVector &Operands) { return ParseStatus::NoMatch; Register Reg; - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return ParseStatus::Failure; if (Reg.Num > 15) @@ -1025,7 +1032,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (isParsingATT() && getLexer().is(AsmToken::Percent)) { // Parse the first register. HaveReg1 = true; - if (parseRegister(Reg1)) + if (parseRegister(Reg1, /*RequirePercent=*/true)) return true; } // So if we have an integer as the first token in ([tok1], ..), it could: @@ -1065,7 +1072,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (parseIntegerRegister(Reg2, RegGR)) return true; } else { - if (isParsingATT() && parseRegister(Reg2)) + if (isParsingATT() && parseRegister(Reg2, /*RequirePercent=*/true)) return true; } } @@ -1355,9 +1362,10 @@ bool SystemZAsmParser::ParseGNUAttribute(SMLoc L) { } bool SystemZAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc, bool RestoreOnFailure) { + SMLoc &EndLoc, bool RequirePercent, + bool RestoreOnFailure) { Register Reg; - if (parseRegister(Reg, RestoreOnFailure)) + if (parseRegister(Reg, RequirePercent, RestoreOnFailure)) return true; if (Reg.Group == RegGR) RegNo = SystemZMC::GR64Regs[Reg.Num]; @@ -1376,12 +1384,14 @@ bool SystemZAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, bool SystemZAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) { - return ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/false); + return ParseRegister(Reg, StartLoc, EndLoc, /*RequirePercent=*/false, + /*RestoreOnFailure=*/false); } ParseStatus SystemZAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) { - bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/true); + bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RequirePercent=*/false, + /*RestoreOnFailure=*/true); bool PendingErrors = getParser().hasPendingError(); getParser().clearPendingErrors(); if (PendingErrors) @@ -1482,7 +1492,7 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, // the instruction isn't recognized. 
if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { Register Reg; - if (parseRegister(Reg)) + if (parseRegister(Reg, /*RequirePercent=*/true)) return true; Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc)); return false; diff --git a/llvm/test/MC/SystemZ/regs-bad.s b/llvm/test/MC/SystemZ/regs-bad.s index 320cba0fc856c..6392ff2863002 100644 --- a/llvm/test/MC/SystemZ/regs-bad.s +++ b/llvm/test/MC/SystemZ/regs-bad.s @@ -262,8 +262,6 @@ # Test general register parsing, with no predetermined class in mind. # -#CHECK: error: register expected -#CHECK: .cfi_offset r0,0 #CHECK: error: invalid register #CHECK: .cfi_offset %,0 #CHECK: error: invalid register @@ -289,7 +287,6 @@ #CHECK: error: invalid register #CHECK: .cfi_offset %arid,0 - .cfi_offset r0,0 .cfi_offset %,0 .cfi_offset %r,0 .cfi_offset %f,0 diff --git a/llvm/test/MC/SystemZ/regs-good.s b/llvm/test/MC/SystemZ/regs-good.s index b4c1edd1b591e..49bf8a48ca4c2 100644 --- a/llvm/test/MC/SystemZ/regs-good.s +++ b/llvm/test/MC/SystemZ/regs-good.s @@ -176,6 +176,70 @@ st 0, 4095(1,15) st 0, 4095(15,1) +#CHECK: .cfi_offset %r0, 0 +#CHECK: .cfi_offset %r1, 8 +#CHECK: .cfi_offset %r2, 16 +#CHECK: .cfi_offset %r3, 24 +#CHECK: .cfi_offset %r4, 32 +#CHECK: .cfi_offset %r5, 40 +#CHECK: .cfi_offset %r6, 48 +#CHECK: .cfi_offset %r7, 56 +#CHECK: .cfi_offset %r8, 64 +#CHECK: .cfi_offset %r9, 72 +#CHECK: .cfi_offset %r10, 80 +#CHECK: .cfi_offset %r11, 88 +#CHECK: .cfi_offset %r12, 96 +#CHECK: .cfi_offset %r13, 104 +#CHECK: .cfi_offset %r14, 112 +#CHECK: .cfi_offset %r15, 120 +#CHECK: .cfi_offset %f0, 128 +#CHECK: .cfi_offset %f1, 136 +#CHECK: .cfi_offset %f2, 144 +#CHECK: .cfi_offset %f3, 152 +#CHECK: .cfi_offset %f4, 160 +#CHECK: .cfi_offset %f5, 168 +#CHECK: .cfi_offset %f6, 176 +#CHECK: .cfi_offset %f7, 184 +#CHECK: .cfi_offset %f8, 192 +#CHECK: .cfi_offset %f9, 200 +#CHECK: .cfi_offset %f10, 208 +#CHECK: .cfi_offset %f11, 216 +#CHECK: .cfi_offset %f12, 224 +#CHECK: .cfi_offset %f13, 232 +#CHECK: .cfi_offset %f14, 240 +#CHECK: .cfi_offset %f15, 248 +#CHECK: .cfi_offset %a0, 256 +#CHECK: .cfi_offset %a1, 260 +#CHECK: .cfi_offset %a2, 264 +#CHECK: .cfi_offset %a3, 268 +#CHECK: .cfi_offset %a4, 272 +#CHECK: .cfi_offset %a5, 276 +#CHECK: .cfi_offset %a6, 280 +#CHECK: .cfi_offset %a7, 284 +#CHECK: .cfi_offset %a8, 288 +#CHECK: .cfi_offset %r9, 292 +#CHECK: .cfi_offset %a10, 296 +#CHECK: .cfi_offset %a11, 300 +#CHECK: .cfi_offset %a12, 304 +#CHECK: .cfi_offset %a13, 308 +#CHECK: .cfi_offset %a14, 312 +#CHECK: .cfi_offset %a15, 316 +#CHECK: .cfi_offset %c0, 318 +#CHECK: .cfi_offset %c1, 326 +#CHECK: .cfi_offset %c2, 334 +#CHECK: .cfi_offset %c3, 342 +#CHECK: .cfi_offset %c4, 350 +#CHECK: .cfi_offset %c5, 358 +#CHECK: .cfi_offset %c6, 366 +#CHECK: .cfi_offset %c7, 374 +#CHECK: .cfi_offset %c8, 382 +#CHECK: .cfi_offset %c9, 390 +#CHECK: .cfi_offset %c10, 398 +#CHECK: .cfi_offset %c11, 406 +#CHECK: .cfi_offset %c12, 414 +#CHECK: .cfi_offset %c13, 422 +#CHECK: .cfi_offset %c14, 430 +#CHECK: .cfi_offset %c15, 438 #CHECK: .cfi_offset %r0, 0 #CHECK: .cfi_offset %r1, 8 #CHECK: .cfi_offset %r2, 16 @@ -306,4 +370,68 @@ .cfi_offset %c13,422 .cfi_offset %c14,430 .cfi_offset %c15,438 + .cfi_offset r0,0 + .cfi_offset r1,8 + .cfi_offset r2,16 + .cfi_offset r3,24 + .cfi_offset r4,32 + .cfi_offset r5,40 + .cfi_offset r6,48 + .cfi_offset r7,56 + .cfi_offset r8,64 + .cfi_offset r9,72 + .cfi_offset r10,80 + .cfi_offset r11,88 + .cfi_offset r12,96 + .cfi_offset r13,104 + .cfi_offset r14,112 + .cfi_offset r15,120 + .cfi_offset f0,128 + 
.cfi_offset f1,136 + .cfi_offset f2,144 + .cfi_offset f3,152 + .cfi_offset f4,160 + .cfi_offset f5,168 + .cfi_offset f6,176 + .cfi_offset f7,184 + .cfi_offset f8,192 + .cfi_offset f9,200 + .cfi_offset f10,208 + .cfi_offset f11,216 + .cfi_offset f12,224 + .cfi_offset f13,232 + .cfi_offset f14,240 + .cfi_offset f15,248 + .cfi_offset a0,256 + .cfi_offset a1,260 + .cfi_offset a2,264 + .cfi_offset a3,268 + .cfi_offset a4,272 + .cfi_offset a5,276 + .cfi_offset a6,280 + .cfi_offset a7,284 + .cfi_offset a8,288 + .cfi_offset r9,292 + .cfi_offset a10,296 + .cfi_offset a11,300 + .cfi_offset a12,304 + .cfi_offset a13,308 + .cfi_offset a14,312 + .cfi_offset a15,316 + .cfi_offset c0,318 + .cfi_offset c1,326 + .cfi_offset c2,334 + .cfi_offset c3,342 + .cfi_offset c4,350 + .cfi_offset c5,358 + .cfi_offset c6,366 + .cfi_offset c7,374 + .cfi_offset c8,382 + .cfi_offset c9,390 + .cfi_offset c10,398 + .cfi_offset c11,406 + .cfi_offset c12,414 + .cfi_offset c13,422 + .cfi_offset c14,430 + .cfi_offset c15,438 .cfi_endproc From bedac64d36dce88ea25bd444c60eaac7d420550e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Thu, 5 Sep 2024 19:45:53 +0200 Subject: [PATCH 267/425] [llvm][SystemZ] Recognize `@GOTENT` modifier in assembler. (#107038) Closes #105918. I'm unsure if there are other places that need to be updated for this. --- llvm/include/llvm/MC/MCExpr.h | 1 + llvm/lib/MC/MCExpr.cpp | 264 +++++++++--------- .../MCTargetDesc/SystemZELFObjectWriter.cpp | 1 + llvm/test/MC/SystemZ/fixups.s | 6 + 4 files changed, 141 insertions(+), 131 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 118b1dd88525a..10bc6ebd6fe50 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -192,6 +192,7 @@ class MCSymbolRefExpr : public MCExpr { VK_Invalid, VK_GOT, + VK_GOTENT, VK_GOTOFF, VK_GOTREL, VK_PCREL, diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index e4ca431c3d25f..c9d5f6580fda4 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -226,6 +226,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_DTPOFF: return "DTPOFF"; case VK_DTPREL: return "DTPREL"; case VK_GOT: return "GOT"; + case VK_GOTENT: return "GOTENT"; case VK_GOTOFF: return "GOTOFF"; case VK_GOTREL: return "GOTREL"; case VK_PCREL: return "PCREL"; @@ -404,137 +405,138 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { MCSymbolRefExpr::VariantKind MCSymbolRefExpr::getVariantKindForName(StringRef Name) { return StringSwitch(Name.lower()) - .Case("dtprel", VK_DTPREL) - .Case("dtpoff", VK_DTPOFF) - .Case("got", VK_GOT) - .Case("gotoff", VK_GOTOFF) - .Case("gotrel", VK_GOTREL) - .Case("pcrel", VK_PCREL) - .Case("gotpcrel", VK_GOTPCREL) - .Case("gotpcrel_norelax", VK_GOTPCREL_NORELAX) - .Case("gottpoff", VK_GOTTPOFF) - .Case("indntpoff", VK_INDNTPOFF) - .Case("ntpoff", VK_NTPOFF) - .Case("gotntpoff", VK_GOTNTPOFF) - .Case("plt", VK_PLT) - .Case("tlscall", VK_TLSCALL) - .Case("tlsdesc", VK_TLSDESC) - .Case("tlsgd", VK_TLSGD) - .Case("tlsld", VK_TLSLD) - .Case("tlsldm", VK_TLSLDM) - .Case("tpoff", VK_TPOFF) - .Case("tprel", VK_TPREL) - .Case("tlvp", VK_TLVP) - .Case("tlvppage", VK_TLVPPAGE) - .Case("tlvppageoff", VK_TLVPPAGEOFF) - .Case("page", VK_PAGE) - .Case("pageoff", VK_PAGEOFF) - .Case("gotpage", VK_GOTPAGE) - .Case("gotpageoff", VK_GOTPAGEOFF) - .Case("imgrel", VK_COFF_IMGREL32) - .Case("secrel32", VK_SECREL) - .Case("size", VK_SIZE) - .Case("abs8", VK_X86_ABS8) - .Case("pltoff", VK_X86_PLTOFF) - 
.Case("l", VK_PPC_LO) - .Case("h", VK_PPC_HI) - .Case("ha", VK_PPC_HA) - .Case("high", VK_PPC_HIGH) - .Case("higha", VK_PPC_HIGHA) - .Case("higher", VK_PPC_HIGHER) - .Case("highera", VK_PPC_HIGHERA) - .Case("highest", VK_PPC_HIGHEST) - .Case("highesta", VK_PPC_HIGHESTA) - .Case("got@l", VK_PPC_GOT_LO) - .Case("got@h", VK_PPC_GOT_HI) - .Case("got@ha", VK_PPC_GOT_HA) - .Case("local", VK_PPC_LOCAL) - .Case("tocbase", VK_PPC_TOCBASE) - .Case("toc", VK_PPC_TOC) - .Case("toc@l", VK_PPC_TOC_LO) - .Case("toc@h", VK_PPC_TOC_HI) - .Case("toc@ha", VK_PPC_TOC_HA) - .Case("u", VK_PPC_U) - .Case("l", VK_PPC_L) - .Case("tls", VK_PPC_TLS) - .Case("dtpmod", VK_PPC_DTPMOD) - .Case("tprel@l", VK_PPC_TPREL_LO) - .Case("tprel@h", VK_PPC_TPREL_HI) - .Case("tprel@ha", VK_PPC_TPREL_HA) - .Case("tprel@high", VK_PPC_TPREL_HIGH) - .Case("tprel@higha", VK_PPC_TPREL_HIGHA) - .Case("tprel@higher", VK_PPC_TPREL_HIGHER) - .Case("tprel@highera", VK_PPC_TPREL_HIGHERA) - .Case("tprel@highest", VK_PPC_TPREL_HIGHEST) - .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA) - .Case("dtprel@l", VK_PPC_DTPREL_LO) - .Case("dtprel@h", VK_PPC_DTPREL_HI) - .Case("dtprel@ha", VK_PPC_DTPREL_HA) - .Case("dtprel@high", VK_PPC_DTPREL_HIGH) - .Case("dtprel@higha", VK_PPC_DTPREL_HIGHA) - .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER) - .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA) - .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST) - .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA) - .Case("got@tprel", VK_PPC_GOT_TPREL) - .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO) - .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI) - .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA) - .Case("got@dtprel", VK_PPC_GOT_DTPREL) - .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO) - .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI) - .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA) - .Case("got@tlsgd", VK_PPC_GOT_TLSGD) - .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO) - .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI) - .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA) - .Case("got@tlsld", VK_PPC_GOT_TLSLD) - .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) - .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) - .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) - .Case("got@pcrel", VK_PPC_GOT_PCREL) - .Case("got@tlsgd@pcrel", VK_PPC_GOT_TLSGD_PCREL) - .Case("got@tlsld@pcrel", VK_PPC_GOT_TLSLD_PCREL) - .Case("got@tprel@pcrel", VK_PPC_GOT_TPREL_PCREL) - .Case("tls@pcrel", VK_PPC_TLS_PCREL) - .Case("notoc", VK_PPC_NOTOC) - .Case("gdgot", VK_Hexagon_GD_GOT) - .Case("gdplt", VK_Hexagon_GD_PLT) - .Case("iegot", VK_Hexagon_IE_GOT) - .Case("ie", VK_Hexagon_IE) - .Case("ldgot", VK_Hexagon_LD_GOT) - .Case("ldplt", VK_Hexagon_LD_PLT) - .Case("lo8", VK_AVR_LO8) - .Case("hi8", VK_AVR_HI8) - .Case("hlo8", VK_AVR_HLO8) - .Case("typeindex", VK_WASM_TYPEINDEX) - .Case("tbrel", VK_WASM_TBREL) - .Case("mbrel", VK_WASM_MBREL) - .Case("tlsrel", VK_WASM_TLSREL) - .Case("got@tls", VK_WASM_GOT_TLS) - .Case("funcindex", VK_WASM_FUNCINDEX) - .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO) - .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI) - .Case("rel32@lo", VK_AMDGPU_REL32_LO) - .Case("rel32@hi", VK_AMDGPU_REL32_HI) - .Case("rel64", VK_AMDGPU_REL64) - .Case("abs32@lo", VK_AMDGPU_ABS32_LO) - .Case("abs32@hi", VK_AMDGPU_ABS32_HI) - .Case("hi", VK_VE_HI32) - .Case("lo", VK_VE_LO32) - .Case("pc_hi", VK_VE_PC_HI32) - .Case("pc_lo", VK_VE_PC_LO32) - .Case("got_hi", VK_VE_GOT_HI32) - .Case("got_lo", VK_VE_GOT_LO32) - .Case("gotoff_hi", VK_VE_GOTOFF_HI32) - .Case("gotoff_lo", VK_VE_GOTOFF_LO32) - .Case("plt_hi", VK_VE_PLT_HI32) - .Case("plt_lo", VK_VE_PLT_LO32) - .Case("tls_gd_hi", 
VK_VE_TLS_GD_HI32) - .Case("tls_gd_lo", VK_VE_TLS_GD_LO32) - .Case("tpoff_hi", VK_VE_TPOFF_HI32) - .Case("tpoff_lo", VK_VE_TPOFF_LO32) - .Default(VK_Invalid); + .Case("dtprel", VK_DTPREL) + .Case("dtpoff", VK_DTPOFF) + .Case("got", VK_GOT) + .Case("gotent", VK_GOTENT) + .Case("gotoff", VK_GOTOFF) + .Case("gotrel", VK_GOTREL) + .Case("pcrel", VK_PCREL) + .Case("gotpcrel", VK_GOTPCREL) + .Case("gotpcrel_norelax", VK_GOTPCREL_NORELAX) + .Case("gottpoff", VK_GOTTPOFF) + .Case("indntpoff", VK_INDNTPOFF) + .Case("ntpoff", VK_NTPOFF) + .Case("gotntpoff", VK_GOTNTPOFF) + .Case("plt", VK_PLT) + .Case("tlscall", VK_TLSCALL) + .Case("tlsdesc", VK_TLSDESC) + .Case("tlsgd", VK_TLSGD) + .Case("tlsld", VK_TLSLD) + .Case("tlsldm", VK_TLSLDM) + .Case("tpoff", VK_TPOFF) + .Case("tprel", VK_TPREL) + .Case("tlvp", VK_TLVP) + .Case("tlvppage", VK_TLVPPAGE) + .Case("tlvppageoff", VK_TLVPPAGEOFF) + .Case("page", VK_PAGE) + .Case("pageoff", VK_PAGEOFF) + .Case("gotpage", VK_GOTPAGE) + .Case("gotpageoff", VK_GOTPAGEOFF) + .Case("imgrel", VK_COFF_IMGREL32) + .Case("secrel32", VK_SECREL) + .Case("size", VK_SIZE) + .Case("abs8", VK_X86_ABS8) + .Case("pltoff", VK_X86_PLTOFF) + .Case("l", VK_PPC_LO) + .Case("h", VK_PPC_HI) + .Case("ha", VK_PPC_HA) + .Case("high", VK_PPC_HIGH) + .Case("higha", VK_PPC_HIGHA) + .Case("higher", VK_PPC_HIGHER) + .Case("highera", VK_PPC_HIGHERA) + .Case("highest", VK_PPC_HIGHEST) + .Case("highesta", VK_PPC_HIGHESTA) + .Case("got@l", VK_PPC_GOT_LO) + .Case("got@h", VK_PPC_GOT_HI) + .Case("got@ha", VK_PPC_GOT_HA) + .Case("local", VK_PPC_LOCAL) + .Case("tocbase", VK_PPC_TOCBASE) + .Case("toc", VK_PPC_TOC) + .Case("toc@l", VK_PPC_TOC_LO) + .Case("toc@h", VK_PPC_TOC_HI) + .Case("toc@ha", VK_PPC_TOC_HA) + .Case("u", VK_PPC_U) + .Case("l", VK_PPC_L) + .Case("tls", VK_PPC_TLS) + .Case("dtpmod", VK_PPC_DTPMOD) + .Case("tprel@l", VK_PPC_TPREL_LO) + .Case("tprel@h", VK_PPC_TPREL_HI) + .Case("tprel@ha", VK_PPC_TPREL_HA) + .Case("tprel@high", VK_PPC_TPREL_HIGH) + .Case("tprel@higha", VK_PPC_TPREL_HIGHA) + .Case("tprel@higher", VK_PPC_TPREL_HIGHER) + .Case("tprel@highera", VK_PPC_TPREL_HIGHERA) + .Case("tprel@highest", VK_PPC_TPREL_HIGHEST) + .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA) + .Case("dtprel@l", VK_PPC_DTPREL_LO) + .Case("dtprel@h", VK_PPC_DTPREL_HI) + .Case("dtprel@ha", VK_PPC_DTPREL_HA) + .Case("dtprel@high", VK_PPC_DTPREL_HIGH) + .Case("dtprel@higha", VK_PPC_DTPREL_HIGHA) + .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER) + .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA) + .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST) + .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA) + .Case("got@tprel", VK_PPC_GOT_TPREL) + .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO) + .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI) + .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA) + .Case("got@dtprel", VK_PPC_GOT_DTPREL) + .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO) + .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI) + .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA) + .Case("got@tlsgd", VK_PPC_GOT_TLSGD) + .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO) + .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI) + .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA) + .Case("got@tlsld", VK_PPC_GOT_TLSLD) + .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) + .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) + .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) + .Case("got@pcrel", VK_PPC_GOT_PCREL) + .Case("got@tlsgd@pcrel", VK_PPC_GOT_TLSGD_PCREL) + .Case("got@tlsld@pcrel", VK_PPC_GOT_TLSLD_PCREL) + .Case("got@tprel@pcrel", VK_PPC_GOT_TPREL_PCREL) + .Case("tls@pcrel", VK_PPC_TLS_PCREL) + .Case("notoc", 
VK_PPC_NOTOC) + .Case("gdgot", VK_Hexagon_GD_GOT) + .Case("gdplt", VK_Hexagon_GD_PLT) + .Case("iegot", VK_Hexagon_IE_GOT) + .Case("ie", VK_Hexagon_IE) + .Case("ldgot", VK_Hexagon_LD_GOT) + .Case("ldplt", VK_Hexagon_LD_PLT) + .Case("lo8", VK_AVR_LO8) + .Case("hi8", VK_AVR_HI8) + .Case("hlo8", VK_AVR_HLO8) + .Case("typeindex", VK_WASM_TYPEINDEX) + .Case("tbrel", VK_WASM_TBREL) + .Case("mbrel", VK_WASM_MBREL) + .Case("tlsrel", VK_WASM_TLSREL) + .Case("got@tls", VK_WASM_GOT_TLS) + .Case("funcindex", VK_WASM_FUNCINDEX) + .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO) + .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI) + .Case("rel32@lo", VK_AMDGPU_REL32_LO) + .Case("rel32@hi", VK_AMDGPU_REL32_HI) + .Case("rel64", VK_AMDGPU_REL64) + .Case("abs32@lo", VK_AMDGPU_ABS32_LO) + .Case("abs32@hi", VK_AMDGPU_ABS32_HI) + .Case("hi", VK_VE_HI32) + .Case("lo", VK_VE_LO32) + .Case("pc_hi", VK_VE_PC_HI32) + .Case("pc_lo", VK_VE_PC_LO32) + .Case("got_hi", VK_VE_GOT_HI32) + .Case("got_lo", VK_VE_GOT_LO32) + .Case("gotoff_hi", VK_VE_GOTOFF_HI32) + .Case("gotoff_lo", VK_VE_GOTOFF_LO32) + .Case("plt_hi", VK_VE_PLT_HI32) + .Case("plt_lo", VK_VE_PLT_LO32) + .Case("tls_gd_hi", VK_VE_TLS_GD_HI32) + .Case("tls_gd_lo", VK_VE_TLS_GD_LO32) + .Case("tpoff_hi", VK_VE_TPOFF_HI32) + .Case("tpoff_lo", VK_VE_TPOFF_LO32) + .Default(VK_Invalid); } /* *** */ diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index de1eedb8daff6..e44b4a5236915 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -185,6 +185,7 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, return getTLSGDReloc(Ctx, Loc, Kind); case MCSymbolRefExpr::VK_GOT: + case MCSymbolRefExpr::VK_GOTENT: if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL) return ELF::R_390_GOTENT; Ctx.reportError(Loc, "Only PC-relative GOT accesses are supported for now"); diff --git a/llvm/test/MC/SystemZ/fixups.s b/llvm/test/MC/SystemZ/fixups.s index ad0b1f18cdcfd..0d59e60fc5957 100644 --- a/llvm/test/MC/SystemZ/fixups.s +++ b/llvm/test/MC/SystemZ/fixups.s @@ -19,6 +19,12 @@ .align 16 larl %r14, target@got +# CHECK: larl %r14, target@GOTENT # encoding: [0xc0,0xe0,A,A,A,A] +# CHECK-NEXT: # fixup A - offset: 2, value: target@GOTENT+2, kind: FK_390_PC32DBL +# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_GOTENT target 0x2 + .align 16 + larl %r14, target@gotent + # CHECK: larl %r14, target@INDNTPOFF # encoding: [0xc0,0xe0,A,A,A,A] # CHECK-NEXT: # fixup A - offset: 2, value: target@INDNTPOFF+2, kind: FK_390_PC32DBL # CHECK-REL: 0x{{[0-9A-F]*2}} R_390_TLS_IEENT target 0x2 From 797f01198e8b41982916ba02d703bd6a96b5347e Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Thu, 5 Sep 2024 14:55:01 -0300 Subject: [PATCH 268/425] [flang][OpenMP] Make lastprivate work with reallocated variables (#106559) Fixes https://github.com/llvm/llvm-project/issues/100951 --- flang/lib/Lower/Bridge.cpp | 56 ++++++++---------- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 51 +---------------- flang/runtime/assign.cpp | 2 +- .../DelayedPrivatization/equivalence.f90 | 2 +- flang/test/Lower/OpenMP/copyin-order.f90 | 4 +- flang/test/Lower/OpenMP/copyin.f90 | 20 +++---- flang/test/Lower/OpenMP/copyprivate.f90 | 2 +- flang/test/Lower/OpenMP/copyprivate2.f90 | 9 +-- .../Lower/OpenMP/default-clause-byref.f90 | 6 +- ...elayed-privatization-allocatable-array.f90 | 2 +- ...privatization-allocatable-firstprivate.f90 | 2 +- 
.../OpenMP/delayed-privatization-array.f90 | 4 +- .../delayed-privatization-firstprivate.f90 | 2 +- flang/test/Lower/OpenMP/implicit-dsa.f90 | 26 ++++----- .../Lower/OpenMP/lastprivate-allocatable.f90 | 57 ++++++++++++++----- .../Lower/OpenMP/lastprivate-commonblock.f90 | 4 +- flang/test/Lower/OpenMP/lastprivate-iv.f90 | 6 +- .../parallel-lastprivate-clause-scalar.f90 | 22 +++---- .../OpenMP/parallel-wsloop-firstpriv.f90 | 6 +- .../Lower/OpenMP/parallel-wsloop-lastpriv.f90 | 10 ++-- flang/test/Lower/OpenMP/parallel-wsloop.f90 | 8 +-- flang/test/Lower/OpenMP/sections.f90 | 20 +++---- flang/test/Lower/OpenMP/single.f90 | 4 +- .../test/Lower/OpenMP/statement-function.f90 | 4 +- flang/test/Lower/OpenMP/task.f90 | 6 +- flang/test/Lower/OpenMP/task2.f90 | 9 +-- 26 files changed, 157 insertions(+), 187 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 1f2724290b885..7cdecb788425a 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -858,7 +858,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { copyVarHLFIR(loc, Fortran::lower::SymbolBox::Intrinsic{dst}, Fortran::lower::SymbolBox::Intrinsic{src}, isAllocatable, - isPointer); + isPointer, Fortran::semantics::Symbol::Flags()); } void copyHostAssociateVar( @@ -895,7 +895,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { rhs_sb = &hsb; } - copyVar(sym, *lhs_sb, *rhs_sb); + copyVar(sym, *lhs_sb, *rhs_sb, sym.flags()); if (copyAssignIP && copyAssignIP->isSet() && sym.test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { @@ -1211,16 +1211,18 @@ class FirConverter : public Fortran::lower::AbstractConverter { void copyVar(const Fortran::semantics::Symbol &sym, const Fortran::lower::SymbolBox &lhs_sb, - const Fortran::lower::SymbolBox &rhs_sb) { + const Fortran::lower::SymbolBox &rhs_sb, + Fortran::semantics::Symbol::Flags flags) { mlir::Location loc = genLocation(sym.name()); if (lowerToHighLevelFIR()) - copyVarHLFIR(loc, lhs_sb, rhs_sb); + copyVarHLFIR(loc, lhs_sb, rhs_sb, flags); else copyVarFIR(loc, sym, lhs_sb, rhs_sb); } void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, - Fortran::lower::SymbolBox src) { + Fortran::lower::SymbolBox src, + Fortran::semantics::Symbol::Flags flags) { assert(lowerToHighLevelFIR()); bool isBoxAllocatable = dst.match( @@ -1237,51 +1239,39 @@ class FirConverter : public Fortran::lower::AbstractConverter { }, [](const auto &box) { return false; }); - copyVarHLFIR(loc, dst, src, isBoxAllocatable, isBoxPointer); + copyVarHLFIR(loc, dst, src, isBoxAllocatable, isBoxPointer, flags); } void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, Fortran::lower::SymbolBox src, bool isAllocatable, - bool isPointer) { + bool isPointer, Fortran::semantics::Symbol::Flags flags) { assert(lowerToHighLevelFIR()); hlfir::Entity lhs{dst.getAddr()}; hlfir::Entity rhs{src.getAddr()}; - // Temporary_lhs is set to true in hlfir.assign below to avoid user - // assignment to be used and finalization to be called on the LHS. - // This may or may not be correct but mimics the current behaviour - // without HLFIR. + auto copyData = [&](hlfir::Entity l, hlfir::Entity r) { // Dereference RHS and load it if trivial scalar. 
r = hlfir::loadTrivialScalar(loc, *builder, r); - builder->create( - loc, r, l, - /*isWholeAllocatableAssignment=*/false, - /*keepLhsLengthInAllocatableAssignment=*/false, - /*temporary_lhs=*/true); + builder->create(loc, r, l, isAllocatable); }; - if (isAllocatable) { - // Deep copy allocatable if it is allocated. - // Note that when allocated, the RHS is already allocated with the LHS - // shape for copy on entry in createHostAssociateVarClone. - // For lastprivate, this assumes that the RHS was not reallocated in - // the OpenMP region. - lhs = hlfir::derefPointersAndAllocatables(loc, *builder, lhs); - mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, lhs); - mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); - builder->genIfThen(loc, isAllocated) - .genThen([&]() { - // Copy the DATA, not the descriptors. - copyData(lhs, rhs); - }) - .end(); - } else if (isPointer) { + if (isPointer) { // Set LHS target to the target of RHS (do not copy the RHS // target data into the LHS target storage). auto loadVal = builder->create(loc, rhs); builder->create(loc, loadVal, lhs); + } else if (isAllocatable && + flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) { + // For firstprivate allocatable variables, RHS must be copied only when + // LHS is allocated. + hlfir::Entity temp = + hlfir::derefPointersAndAllocatables(loc, *builder, lhs); + mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, temp); + mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); + builder->genIfThen(loc, isAllocated) + .genThen([&]() { copyData(lhs, rhs); }) + .end(); } else { - // Non ALLOCATABLE/POINTER variable. Simple DATA copy. copyData(lhs, rhs); } } diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index a2003473a0fd8..78c3aa5a1c16c 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -132,54 +132,9 @@ void DataSharingProcessor::copyFirstPrivateSymbol( } void DataSharingProcessor::copyLastPrivateSymbol( - const semantics::Symbol *sym, - [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) { - if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) { - bool allocatable = semantics::IsAllocatable(sym->GetUltimate()); - if (!allocatable) { - converter.copyHostAssociateVar(*sym, lastPrivIP); - return; - } - - // copyHostAssociateVar doesn't work properly if the privatised copy was - // reallocated (e.g. by assignment): it will only copy if the ultimate - // symbol was already allocated, and it only copies data so any reallocated - // lengths etc are lost - - // 1) Fetch the original copy of the variable. - assert(sym->has() && - "No host-association found"); - const Fortran::semantics::Symbol &hsym = sym->GetUltimate(); - Fortran::lower::SymbolBox hsb = symTable->lookupOneLevelUpSymbol(hsym); - assert(hsb && "Host symbol box not found"); - - // 2) Fetch the copied one that will mask the original. - Fortran::lower::SymbolBox sb = symTable->shallowLookupSymbol(sym); - assert(sb && "Host-associated symbol box not found"); - assert(hsb.getAddr() != sb.getAddr() && - "Host and associated symbol boxes are the same"); - - // 3) Perform the assignment. 
- fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mlir::Location loc = converter.genLocation(sym->name()); - mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint(); - if (lastPrivIP && lastPrivIP->isSet()) - builder.restoreInsertionPoint(*lastPrivIP); - else - builder.setInsertionPointAfter(sb.getAddr().getDefiningOp()); - - hlfir::Entity dst{hsb.getAddr()}; - hlfir::Entity src{sb.getAddr()}; - builder.create( - loc, src, dst, /*isWholeAllocatableAssignment=*/allocatable, - /*keepLhsLengthInAllocatableAssignment=*/false, - /*temporary_lhs=*/false); - - if (lastPrivIP && lastPrivIP->isSet() && - sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { - builder.restoreInsertionPoint(insPt); - } - } + const semantics::Symbol *sym, mlir::OpBuilder::InsertPoint *lastPrivIP) { + if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) + converter.copyHostAssociateVar(*sym, lastPrivIP); } void DataSharingProcessor::collectOmpObjectListSymbol( diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index c3c9b0ba10ab3..d558ada51cd21 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -591,7 +591,7 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from, } } - Assign(to, from, terminator, PolymorphicLHS); + Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS); } void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var, diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 index 1cbbcdcd0e4fd..2307c09513795 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 @@ -22,7 +22,7 @@ subroutine private_common ! CHECK: } copy { ! CHECK: ^bb0(%[[ORIG_PTR:.*]]: ![[X_TYPE]], %[[PRIV_REF:.*]]: ![[X_TYPE]]): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_PTR]] : !fir.ptr -! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] temporary_lhs : f32, ![[X_TYPE]] +! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] : f32, ![[X_TYPE]] ! CHECK: omp.yield(%[[PRIV_REF]] : ![[X_TYPE]]) ! CHECK: } diff --git a/flang/test/Lower/OpenMP/copyin-order.f90 b/flang/test/Lower/OpenMP/copyin-order.f90 index 8999c247eebdc..ffbbad00e7cbd 100644 --- a/flang/test/Lower/OpenMP/copyin-order.f90 +++ b/flang/test/Lower/OpenMP/copyin-order.f90 @@ -6,11 +6,11 @@ !CHECK: %[[THP1:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1 !CHECK: %[[DCL1:[0-9]+]]:2 = hlfir.declare %[[THP1]] {uniq_name = "_QFcopyin_scalar_arrayEx1"} !CHECK: %[[LD1:[0-9]+]] = fir.load %{{[0-9]+}}#0 -!CHECK: hlfir.assign %[[LD1]] to %[[DCL1]]#0 temporary_lhs +!CHECK: hlfir.assign %[[LD1]] to %[[DCL1]]#0 !CHECK: %[[THP2:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1 !CHECK: %[[SHP2:[0-9]+]] = fir.shape %c{{[0-9]+}} !CHECK: %[[DCL2:[0-9]+]]:2 = hlfir.declare %[[THP2]](%[[SHP2]]) {uniq_name = "_QFcopyin_scalar_arrayEx2"} -!CHECK: hlfir.assign %{{[0-9]+}}#0 to %[[DCL2]]#0 temporary_lhs +!CHECK: hlfir.assign %{{[0-9]+}}#0 to %[[DCL2]]#0 !CHECK: omp.barrier !CHECK: fir.call @_QPsub1(%[[DCL1]]#1, %[[DCL2]]#1) !CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index 34c83fca46417..4023987a841b8 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -21,11 +21,11 @@ ! CHECK: %[[VAL_11:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref -> !fir.ref ! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFcopyin_scalar_arrayEx1"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 : i32, !fir.ref ! CHECK: %[[VAL_14:.*]] = omp.threadprivate %[[VAL_7]]#1 : !fir.ref> -> !fir.ref> ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {uniq_name = "_QFcopyin_scalar_arrayEx2"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) -! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_16]]#0 temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_16]]#0 : !fir.ref>, !fir.ref> ! CHECK: omp.barrier ! CHECK: fir.call @_QPsub1(%[[VAL_12]]#1, %[[VAL_16]]#1) fastmath : (!fir.ref, !fir.ref>) -> () ! CHECK: omp.terminator @@ -61,11 +61,11 @@ subroutine copyin_scalar_array() ! CHECK: omp.parallel { ! CHECK: %[[VAL_13:.*]] = omp.threadprivate %[[VAL_2]]#1 : !fir.ref> -> !fir.ref> ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] typeparams %[[VAL_1]] {uniq_name = "_QFcopyin_char_chararrayEx3"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_14]]#0 temporary_lhs : !fir.ref>, !fir.ref> +! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_14]]#0 : !fir.ref>, !fir.ref> ! CHECK: %[[VAL_15:.*]] = omp.threadprivate %[[VAL_9]]#1 : !fir.ref>> -> !fir.ref>> ! CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]](%[[VAL_16]]) typeparams %[[VAL_6]] {uniq_name = "_QFcopyin_char_chararrayEx4"} : (!fir.ref>>, !fir.shape<1>, index) -> (!fir.ref>>, !fir.ref>>) -! CHECK: hlfir.assign %[[VAL_12]]#0 to %[[VAL_17]]#0 temporary_lhs : !fir.ref>>, !fir.ref>> +! CHECK: hlfir.assign %[[VAL_12]]#0 to %[[VAL_17]]#0 : !fir.ref>>, !fir.ref>> ! CHECK: omp.barrier ! CHECK: %[[VAL_18:.*]] = fir.emboxchar %[[VAL_14]]#1, %[[VAL_1]] : (!fir.ref>, index) -> !fir.boxchar<1> ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]]#1 : (!fir.ref>>) -> !fir.ref> @@ -116,7 +116,7 @@ subroutine copyin_char_chararray() ! CHECK: omp.parallel { ! CHECK: %[[VAL_27:.*]] = omp.threadprivate %[[VAL_17]]#1 : !fir.ref}>> -> !fir.ref}>> ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_27]] {uniq_name = "_QFcopyin_derived_typeEx5"} : (!fir.ref}>>) -> (!fir.ref}>>, !fir.ref}>>) -! CHECK: hlfir.assign %[[VAL_19]]#0 to %[[VAL_28]]#0 temporary_lhs : !fir.ref}>>, !fir.ref}>> +! CHECK: hlfir.assign %[[VAL_19]]#0 to %[[VAL_28]]#0 : !fir.ref}>>, !fir.ref}>> ! CHECK: omp.barrier ! CHECK: fir.call @_QPsub3(%[[VAL_28]]#1) fastmath : (!fir.ref}>>) -> () ! CHECK: omp.terminator @@ -150,7 +150,7 @@ subroutine copyin_derived_type() ! CHECK: %[[VAL_8:.*]] = omp.threadprivate %[[VAL_3]]#1 : !fir.ref -> !fir.ref ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFcombined_parallel_worksharing_loopEx6"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_9]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_9]]#0 : i32, !fir.ref ! CHECK: omp.barrier @@ -194,7 +194,7 @@ subroutine combined_parallel_worksharing_loop() ! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref -> !fir.ref ! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFcombined_parallel_sectionsEx7"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: omp.sections { ! CHECK: omp.section { @@ -247,7 +247,7 @@ subroutine combined_parallel_sections() ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFcommon_1Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_19]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_19]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: omp.sections { ! CHECK: omp.section { @@ -318,9 +318,9 @@ subroutine common_1() ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFcommon_2Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_32]] to %[[VAL_26]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_32]] to %[[VAL_26]]#0 : i32, !fir.ref ! CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref -! CHECK: hlfir.assign %[[VAL_33]] to %[[VAL_31]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[VAL_33]] to %[[VAL_31]]#0 : i32, !fir.ref ! CHECK: omp.barrier ! CHECK: %[[VAL_19:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} diff --git a/flang/test/Lower/OpenMP/copyprivate.f90 b/flang/test/Lower/OpenMP/copyprivate.f90 index 4b3d8a6b596ef..bb86aa5356b1e 100644 --- a/flang/test/Lower/OpenMP/copyprivate.f90 +++ b/flang/test/Lower/OpenMP/copyprivate.f90 @@ -35,7 +35,7 @@ !CHECK-NEXT: %[[DST:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_copy_i32_dst"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_copy_i32_src"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[SRC_VAL:.*]] = fir.load %[[SRC]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[SRC_VAL]] to %[[DST]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[SRC_VAL]] to %[[DST]]#0 : i32, !fir.ref !CHECK-NEXT: return !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/copyprivate2.f90 b/flang/test/Lower/OpenMP/copyprivate2.f90 index 38235e8ec79c3..848ebe39e45f4 100644 --- a/flang/test/Lower/OpenMP/copyprivate2.f90 +++ b/flang/test/Lower/OpenMP/copyprivate2.f90 @@ -24,12 +24,9 @@ !CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs, !CHECK-SAME: uniq_name = "_copy_box_heap_Uxi32_src"} : (!fir.ref>>>) -> !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) -!CHECK-NEXT: %[[DST_BOX:.*]] = fir.load %[[DST]]#0 : !fir.ref>>> -!CHECK: fir.if %{{.*}} { -!CHECK-NEXT: %[[SRC_BOX:.*]] = fir.load %[[SRC]]#0 : !fir.ref>>> -!CHECK-NEXT: hlfir.assign %[[SRC_BOX]] to %[[DST_BOX]] temporary_lhs : !fir.box>>, -!CHECK-SAME: !fir.box>> -!CHECK-NEXT: } +!CHECK-NEXT: %[[SRC_BOX:.*]] = fir.load %[[SRC]]#0 : !fir.ref>>> +!CHECK-NEXT: hlfir.assign %[[SRC_BOX]] to %[[DST]]#0 realloc : !fir.box>>, +!CHECK-SAME: !fir.box>> !CHECK-NEXT: return !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index 626ba3335a8c1..7e9011f9c1bd5 100644 --- 
a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -15,7 +15,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_W:.*]]: !fir.ref, %[[PRIV_W:.*]]: !fir.ref): !CHECK: %[[ORIG_W_VAL:.*]] = fir.load %[[ORIG_W]] -!CHECK: hlfir.assign %[[ORIG_W_VAL]] to %[[PRIV_W]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_W_VAL]] to %[[PRIV_W]] !CHECK: omp.yield(%[[PRIV_W]] : !fir.ref) !CHECK: } @@ -27,7 +27,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_Y:.*]]: !fir.ref, %[[PRIV_Y:.*]]: !fir.ref): !CHECK: %[[ORIG_Y_VAL:.*]] = fir.load %[[ORIG_Y]] -!CHECK: hlfir.assign %[[ORIG_Y_VAL]] to %[[PRIV_Y]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_Y_VAL]] to %[[PRIV_Y]] !CHECK: omp.yield(%[[PRIV_Y]] : !fir.ref) !CHECK: } @@ -60,7 +60,7 @@ !CHECK: } copy { !CHECK: ^bb0(%[[ORIG_X:.*]]: !fir.ref, %[[PRIV_X:.*]]: !fir.ref): !CHECK: %[[ORIG_X_VAL:.*]] = fir.load %[[ORIG_X]] -!CHECK: hlfir.assign %[[ORIG_X_VAL]] to %[[PRIV_X]] temporary_lhs +!CHECK: hlfir.assign %[[ORIG_X_VAL]] to %[[PRIV_X]] !CHECK: omp.yield(%[[PRIV_X]] : !fir.ref) !CHECK: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 index 47e163014fe86..759d80cf45b2a 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 @@ -61,7 +61,7 @@ subroutine delayed_privatization_private(var1, l1) ! CHECK-NEXT: fir.if %[[COPY_COND]] { ! CHECK-NEXT: %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] -! CHECK-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_BASE_VAL]] temporary_lhs +! CHECK-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_PRIV_ARG]] realloc ! CHECK-NEXT: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 index 5f09371bbaba2..9c97c689dad70 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 @@ -35,7 +35,7 @@ subroutine delayed_privatization_allocatable ! CHECK-NEXT: %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] ! CHECK-NEXT: %[[ORIG_BASE_ADDR:.*]] = fir.box_addr %[[ORIG_BASE_VAL]] ! CHECK-NEXT: %[[ORIG_BASE_LD:.*]] = fir.load %[[ORIG_BASE_ADDR]] -! CHECK-NEXT: hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_BASE_BOX]] temporary_lhs +! CHECK-NEXT: hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_PRIV_ARG]] realloc ! CHECK-NEXT: } ! RUN: %flang -c -emit-llvm -fopenmp -mmlir --openmp-enable-delayed-privatization \ diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 index 1d291b2ac0feb..3d641a0d69689 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 @@ -42,7 +42,7 @@ subroutine delayed_privatization_private_1d(var1, l1, u1) ! ONE_DIM-NEXT: } copy { ! ONE_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]): -! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs +! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] ! ONE_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]]) ! ONE_DIM-NEXT: } @@ -75,7 +75,7 @@ subroutine delayed_privatization_private_2d(var1, l1, u1, l2, u2) ! TWO_DIM-NEXT: } copy { ! 
TWO_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]): -! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs +! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] ! TWO_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]]) ! TWO_DIM-NEXT: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 index 0fb81d68016a4..119f77ea26626 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 @@ -23,7 +23,7 @@ subroutine delayed_privatization_firstprivate ! CHECK: } copy { ! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref, %[[PRIV_PRIV_ARG:.*]]: !fir.ref): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref -! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : i32, !fir.ref ! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref) ! CHECK: } diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index fd38ca6b2f6f2..925677469847e 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -17,7 +17,7 @@ !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test1Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test1Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NOT: fir.alloca !CHECK: } !CHECK: omp.task { @@ -44,7 +44,7 @@ subroutine implicit_dsa_test1 !CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test2Ex"} !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test2 @@ -76,7 +76,7 @@ subroutine implicit_dsa_test2 !CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test3Ex"} !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test3Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 @@ -115,11 +115,11 @@ subroutine implicit_dsa_test3 !CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref 
+!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV2_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test4Ez"} !CHECK-NEXT: %[[PRIV2_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 !CHECK-NEXT: hlfir.assign %[[ZERO]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 @@ -129,11 +129,11 @@ subroutine implicit_dsa_test3 !CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV2_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test4Ey"} !CHECK-NEXT: %[[PRIV2_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK-NEXT: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 @@ -166,7 +166,7 @@ subroutine implicit_dsa_test4 !CHECK: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test5Ex"} !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel { !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref @@ -197,15 +197,15 @@ subroutine implicit_dsa_test5 !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test6Ey"} !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to 
%[[PRIV_Y_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test6Ez"} !CHECK-NEXT: %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP3:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel private({{.*}} %{{.*}}#0 -> %[[PRIV2_X:.*]] : {{.*}}, {{.*}} %{{.*}}#0 -> %[[PRIV2_Y:.*]] : {{.*}}) { !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NOT: hlfir.assign @@ -244,11 +244,11 @@ subroutine implicit_dsa_test6 !CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"} !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test7 diff --git a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 index 41bbb182aade2..0dce4d514bd10 100644 --- a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 @@ -1,18 +1,6 @@ ! RUN: %flang_fc1 -emit-hlfir -o - -fopenmp %s | FileCheck %s ! RUN: bbc -emit-hlfir -o - -fopenmp %s | FileCheck %s -program lastprivate_allocatable - integer, allocatable :: a - integer :: i - ! a is unallocated here - !$omp parallel do lastprivate(a) - do i=1,1 - a = 42 - enddo - !$omp end parallel do - ! a should be allocated here -end program - ! CHECK-LABEL: func.func @_QQmain() ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box> {bindc_name = "a", uniq_name = "_QFEa"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits !fir.heap @@ -32,7 +20,50 @@ program lastprivate_allocatable ! store loop IV ! CHECK: fir.store %{{.*}} to %[[VAL_18]]#1 : !fir.ref ! assign private variable to original copy: realloc -! CHECK: hlfir.assign %[[VAL_16]]#0 to %[[VAL_3]]#0 realloc : !fir.ref>>, !fir.ref>> +! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref>> +! CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box>) -> !fir.heap +! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.heap +! CHECK: hlfir.assign %[[VAL_25]] to %[[VAL_3]]#0 realloc : i32, !fir.ref>> ! CHECK-NEXT: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } +program lastprivate_allocatable + integer, allocatable :: a + integer :: i + ! a is unallocated here + !$omp parallel do lastprivate(a) + do i=1,1 + a = 42 + enddo + !$omp end parallel do + ! a should be allocated here +end program + +! 
CHECK-LABEL: func @_QPlastprivate_realloc() +! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFlastprivate_reallocEa"} : +! CHECK-SAME: (!fir.ref>>>>) -> +! CHECK-SAME: (!fir.ref>>>>, !fir.ref>>>>) +! CHECK: omp.parallel { +! CHECK: %[[A_PRIV:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFlastprivate_reallocEa"} : +! CHECK-SAME: (!fir.ref>>>>) -> +! CHECK-SAME: (!fir.ref>>>>, !fir.ref>>>>) +! CHECK: omp.sections { +! CHECK: omp.section { +! CHECK: %[[TEMP:.*]] = fir.load %[[A_PRIV:.*]]#0 : !fir.ref>>>> +! CHECK: hlfir.assign %[[TEMP]] to %[[A]]#0 realloc : !fir.box>>>, +! CHECK-SAME: !fir.ref>>>> +! CHECK: } +! CHECK: } +! CHECK: } +subroutine lastprivate_realloc() + complex, allocatable :: a(:) + + allocate(a(2)) + !$omp parallel + !$omp sections lastprivate(a) + !$omp section + deallocate(a) + allocate(a(3)) + !$omp end sections + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 index f823bf6c56ae3..dcba34b2da8ef 100644 --- a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 @@ -26,9 +26,9 @@ !CHECK: fir.if %[[LAST_ITER]] { !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref !CHECK: %[[PRIVATE_X_VAL:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[PRIVATE_X_VAL]] to %[[X_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[PRIVATE_X_VAL]] to %[[X_DECL]]#0 : f32, !fir.ref !CHECK: %[[PRIVATE_Y_VAL:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[PRIVATE_Y_VAL]] to %[[Y_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[PRIVATE_Y_VAL]] to %[[Y_DECL]]#0 : f32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } diff --git a/flang/test/Lower/OpenMP/lastprivate-iv.f90 b/flang/test/Lower/OpenMP/lastprivate-iv.f90 index 21a701441cb56..609192471eae1 100644 --- a/flang/test/Lower/OpenMP/lastprivate-iv.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-iv.f90 @@ -24,7 +24,7 @@ !CHECK: fir.if %[[CMP]] { !CHECK: fir.store %[[V]] to %[[I]]#1 : !fir.ref !CHECK: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref -!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -62,7 +62,7 @@ subroutine lastprivate_iv_inc() !CHECK: fir.if %[[CMP]] { !CHECK: fir.store %[[V]] to %[[I]]#1 : !fir.ref !CHECK: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref -!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[I_VAL]] to %[[I2]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -88,7 +88,7 @@ subroutine lastprivate_iv_i1 !CHECK: %[[I8_VAL:.*]] = fir.convert %{{.*}} : (i32) -> i8 !CHECK: fir.store %[[I8_VAL]] to %[[IV:.*]]#1 : !fir.ref !CHECK: %[[IV_VAL:.*]] = fir.load %[[IV]]#0 : !fir.ref -!CHECK: hlfir.assign %[[IV_VAL]] to %{{.*}}#0 temporary_lhs : i8, !fir.ref +!CHECK: hlfir.assign %[[IV_VAL]] to %{{.*}}#0 : i8, !fir.ref !CHECK: } !$omp do lastprivate(i1) do i1=1,8 diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index be0cc4195c280..62bc247a1456a 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -37,8 +37,8 @@ !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! 
Testing lastprivate val update -!CHECK: hlfir.assign %[[ARG1_PVT_DECL]]#0 to %[[ARG1_DECL]]#0 temporary_lhs : !fir.ref>, !fir.ref> -!CHECK: } +!CHECK: hlfir.assign %[[ARG1_PVT_DECL]]#0 to %[[ARG1_DECL]]#0 : !fir.ref>, !fir.ref> +!CHECK: } !CHECK: omp.yield !CHECK: } !CHECK: omp.terminator @@ -76,7 +76,7 @@ subroutine lastprivate_character(arg1) ! Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -118,9 +118,9 @@ subroutine lastprivate_int(arg1) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! Testing lastprivate val update !CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -163,9 +163,9 @@ subroutine mult_lastprivate_int(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref !Testing lastprivate val update !CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: omp.yield !CHECK: } @@ -194,7 +194,7 @@ subroutine mult_lastprivate_int2(arg1, arg2) !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} !CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref ! Lastprivate Allocation !CHECK: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2", pinned, {{.*}}} !CHECK: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -213,7 +213,7 @@ subroutine mult_lastprivate_int2(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! 
Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG2_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: } !CHECK-NEXT: omp.yield !CHECK-NEXT: } @@ -242,7 +242,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} !CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: %[[IV:.*]] = fir.alloca i32 {bindc_name = "n", pinned, {{.*}}} !CHECK-NEXT: hlfir.declare %[[IV]] @@ -261,7 +261,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) !CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref ! Testing lastprivate val update !CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref !CHECK-NEXT: } !CHECK-NEXT: omp.yield !CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 index 33dab125b3b2d..de3f42be10482 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 @@ -15,7 +15,7 @@ subroutine omp_do_firstprivate(a) ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_firstprivateEa"} ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK-NEXT: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK-NEXT: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -52,12 +52,12 @@ subroutine omp_do_firstprivate2(a, n) ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, {{.*}}} ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_firstprivate2En"} ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LD1:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD1]] to %[[N_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[LD1]] to %[[N_PVT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! 
CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 index 9b90ac28f693c..254aeff21d06e 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 @@ -34,7 +34,7 @@ subroutine omp_do_lastprivate(a) ! CHECK: fir.if %[[SEL]] { ! CHECK: fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield @@ -84,9 +84,9 @@ subroutine omp_do_lastprivate2(a, n) ! CHECK: fir.if %[[SEL]] { ! CHECK: fir.store %[[NEXT_ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: %[[N_PVT_LOAD:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[N_PVT_LOAD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[N_PVT_LOAD]] to %[[ARG1_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK: omp.yield @@ -143,7 +143,7 @@ subroutine omp_do_lastprivate_collapse2(a) ! CHECK: fir.store %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield @@ -219,7 +219,7 @@ subroutine omp_do_lastprivate_collapse3(a) ! CHECK: fir.store %[[NEXT_ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.store %[[NEXT_ARG3]] to %[[K_PVT_DECL]]#1 : !fir.ref ! CHECK: %[[A_PVT_LOAD:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[A_PVT_LOAD]] to %[[ARG0_DECL]]#0 : i32, !fir.ref ! CHECK: } ! CHECK-NEXT: omp.yield diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90 index 5853d07c46c7d..de1b8f4bc7d04 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90 @@ -100,7 +100,7 @@ subroutine parallel_do_with_privatisation_clauses(cond,nt) ! CHECK: %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} ! CHECK: %[[PRIVATE_NT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 temporary_lhs : i32, !fir.ref + ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 : i32, !fir.ref ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 @@ -250,7 +250,7 @@ end subroutine parallel_do_private ! 
CHECK: %[[NT_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"} ! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -298,12 +298,12 @@ end subroutine omp_parallel_do_multiple_firstprivate ! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} ! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} ! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref +! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 : i32, !fir.ref ! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! 
CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index c79d6b020f3f5..2287918528635 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -83,7 +83,7 @@ end program sample !CHECK: %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"} !CHECK: %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 : f32, !fir.ref !CHECK: omp.sections { !CHECK: omp.section { !CHECK: omp.terminator @@ -138,7 +138,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -150,7 +150,7 @@ subroutine lastprivate() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.barrier !CHECK: omp.sections { !$omp sections firstprivate(x) lastprivate(x) @@ -169,7 +169,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -181,7 +181,7 @@ subroutine lastprivate() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.barrier !CHECK: omp.sections nowait { !$omp sections firstprivate(x) lastprivate(x) @@ -200,7 +200,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !$omp section @@ -221,7 +221,7 @@ subroutine lastprivate() !CHECK: %[[RESULT:.*]] = arith.addi 
%[[INNER_PRIVATE_X]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[LOADED_VALUE:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[LOADED_VALUE]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[LOADED_VALUE]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !CHECK: omp.terminator @@ -247,9 +247,9 @@ subroutine lastprivate() !CHECK: omp.sections { !CHECK: omp.section { !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: %[[TEMP2:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP2]] to %[[Y_DECL]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP2]] to %[[Y_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } !CHECK: omp.terminator @@ -289,7 +289,7 @@ subroutine unstructured_sections_privatization() !CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFunstructured_sections_privatizationEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : f32, !fir.ref +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : f32, !fir.ref !CHECK: omp.sections { !CHECK: omp.section { !CHECK: cf.br ^bb1 diff --git a/flang/test/Lower/OpenMP/single.f90 b/flang/test/Lower/OpenMP/single.f90 index 2e5d05dd7b472..9fd34344eb43b 100644 --- a/flang/test/Lower/OpenMP/single.f90 +++ b/flang/test/Lower/OpenMP/single.f90 @@ -84,7 +84,7 @@ end subroutine single_allocate ! CHECK: %[[Y_PVT:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatizationEy"} ! CHECK: %[[Y_PVT_DECL:.*]]:2 = hlfir.declare %[[Y_PVT]] {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[Y_LOAD:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 temporary_lhs : f64, !fir.ref +! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 : f64, !fir.ref ! CHECK: fir.call @_QPbar(%[[X_PVT_DECL]]#1, %[[Y_PVT_DECL]]#1) fastmath : (!fir.ref, !fir.ref) -> () ! CHECK: omp.terminator ! CHECK: } @@ -112,7 +112,7 @@ subroutine single_privatization(x, y) ! CHECK: %[[Y_PVT:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatization2Ey"} ! CHECK: %[[Y_PVT_DECL:.*]]:2 = hlfir.declare %[[Y_PVT]] {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[Y_LOAD:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 temporary_lhs : f64, !fir.ref +! CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PVT_DECL]]#0 : f64, !fir.ref ! CHECK: fir.call @_QPbar(%[[X_PVT_DECL]]#1, %[[Y_PVT_DECL]]#1) fastmath : (!fir.ref, !fir.ref) -> () ! CHECK: omp.terminator ! 
CHECK: } diff --git a/flang/test/Lower/OpenMP/statement-function.f90 b/flang/test/Lower/OpenMP/statement-function.f90 index 6cdbcb6e141c7..fd6f5986bb072 100644 --- a/flang/test/Lower/OpenMP/statement-function.f90 +++ b/flang/test/Lower/OpenMP/statement-function.f90 @@ -26,10 +26,10 @@ subroutine test_implicit_use() !CHECK: omp.task !CHECK: %[[PRIV_IEXP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiexp"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP0:.*]] = fir.load %[[IEXP]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP0]] to %[[PRIV_IEXP]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP0]] to %[[PRIV_IEXP]]#0 : i32, !fir.ref !CHECK: %[[PRIV_IIMP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiimp"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TEMP1:.*]] = fir.load %[[IIMP]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP1]] to %[[PRIV_IIMP]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[TEMP1]] to %[[PRIV_IIMP]]#0 : i32, !fir.ref subroutine test_implicit_use2() implicit none integer :: iexp, iimp diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index afbe2cbfa746e..28e438d342d35 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -201,10 +201,10 @@ subroutine task_firstprivate !CHECK: %[[INT_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "int_var", pinned, uniq_name = "_QFtask_firstprivateEint_var"} !CHECK: %[[INT_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[INT_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[INT_VAR_LOAD:.+]] = fir.load %[[INT_VAR]]#0 : !fir.ref -!CHECK: hlfir.assign %[[INT_VAR_LOAD]] to %[[INT_VAR_FIRSTPRIVATE]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[INT_VAR_LOAD]] to %[[INT_VAR_FIRSTPRIVATE]]#0 : i32, !fir.ref !CHECK: %[[MYTYPE_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}> {bindc_name = "mytype_var", pinned, uniq_name = "_QFtask_firstprivateEmytype_var"} !CHECK: %[[MYTYPE_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) -!CHECK: hlfir.assign %[[MYTYPE_VAR]]#0 to %[[MYTYPE_VAR_FIRSTPRIVATE]]#0 temporary_lhs : !fir.ref>, !fir.ref> +!CHECK: hlfir.assign %[[MYTYPE_VAR]]#0 to %[[MYTYPE_VAR_FIRSTPRIVATE]]#0 : !fir.ref>, !fir.ref> call baz(int_var, mytype_var) !CHECK: omp.terminator !$omp end task @@ -235,7 +235,7 @@ subroutine task_multiple_clauses() !CHECK: %[[Y_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFtask_multiple_clausesEy"} !CHECK: %[[Y_PRIV:.+]]:2 = hlfir.declare %[[Y_PRIV_ALLOCA]] {uniq_name = "_QFtask_multiple_clausesEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y_LOAD:.+]] = fir.load %[[Y]]#0 : !fir.ref -!CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PRIV]]#0 temporary_lhs : i32, !fir.ref +!CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PRIV]]#0 : i32, !fir.ref !CHECK: arith.addi x = x + 12 diff --git a/flang/test/Lower/OpenMP/task2.f90 b/flang/test/Lower/OpenMP/task2.f90 index ce491d95e9397..cff9ebdd375b3 100644 --- a/flang/test/Lower/OpenMP/task2.f90 +++ b/flang/test/Lower/OpenMP/task2.f90 @@ -18,12 +18,9 @@ subroutine omp_task_nested_allocatable_firstprivate !CHECK-SAME: uniq_name = "_QFomp_task_nested_allocatable_firstprivateEa"} : !CHECK-SAME: (!fir.ref>>>) -> !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) -!CHECK: %[[PRIV_A_BOX:.*]] = fir.load %[[PRIV_A]]#0 : 
!fir.ref>>> -!CHECK: fir.if %{{.*}} { -!CHECK: %[[TEMP:.*]] = fir.load %[[A]]#0 : !fir.ref>>> -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_A_BOX]] temporary_lhs : -!CHECK-SAME: !fir.box>>, !fir.box>> -!CHECK: } +!CHECK: %[[TEMP:.*]] = fir.load %[[A]]#0 : !fir.ref>>> +!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_A]]#0 realloc : +!CHECK-SAME: !fir.box>>, !fir.ref>>> !$omp task default(firstprivate) a = 2 !CHECK: } From 4d819daab91f54b90365927ba4b40e5a2eff26a9 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Thu, 5 Sep 2024 10:58:37 -0700 Subject: [PATCH 269/425] [compiler-rt] Simplify definition of uptr We can rely on the compiler-provided macro __UINTPTR_TYPE__ for all non-MSVC compilers. I verified via https://godbolt.org/z/MW9KMjv5f that this works for MSVC as well as GCC 4.5 Clang 3.0, so that should cover all supported compilers. This means we no longer need to explicitly handle new architectures and as an added bonus also adds support for architectures where `unsigned long long` cannot be used to hold pointers (e.g. CHERI). Reviewed By: mstorsjo, vitalybuka Pull Request: https://github.com/llvm/llvm-project/pull/106155 --- .../sanitizer_common/sanitizer_internal_defs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index fefe28e811767..f8f03454ea169 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -138,19 +138,19 @@ // in a portable way by the language itself. namespace __sanitizer { -#if defined(_WIN64) +#if defined(__UINTPTR_TYPE__) +typedef __UINTPTR_TYPE__ uptr; +typedef __INTPTR_TYPE__ sptr; +#elif defined(_WIN64) // 64-bit Windows uses LLP64 data model. typedef unsigned long long uptr; typedef signed long long sptr; -#else -# if (SANITIZER_WORDSIZE == 64) || SANITIZER_APPLE -typedef unsigned long uptr; -typedef signed long sptr; -# else +#elif defined(_WIN32) typedef unsigned int uptr; typedef signed int sptr; -# endif -#endif // defined(_WIN64) +#else +# error Unsupported compiler, missing __UINTPTR_TYPE__ +#endif // defined(__UINTPTR_TYPE__) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use // 64-bit pointer to unwind stack frame. From b2dbcf4dc1078fd62ef2295ff9696173a9991116 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:43:19 -0700 Subject: [PATCH 270/425] [Presburger] Avoid repeated hash lookups (NFC) (#107426) --- mlir/lib/Analysis/Presburger/IntegerRelation.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 87204d2100713..94af81f955e5a 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -2367,10 +2367,8 @@ bool IntegerRelation::removeDuplicateConstraints() { hashTable.insert({row, 0}); for (unsigned k = 1; k < ineqs; ++k) { row = getInequality(k).drop_back(); - if (!hashTable.contains(row)) { - hashTable.insert({row, k}); + if (hashTable.try_emplace(row, k).second) continue; - } // For identical cases, keep only the smaller part of the constant term. 
unsigned l = hashTable[row]; From 7cf18ff22b626efb0dad6eb9daebea821faff438 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:43:36 -0700 Subject: [PATCH 271/425] [LLVMIR] Avoid repeated hash lookups (NFC) (#107428) --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bd76165053488..d1732cb808928 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1021,9 +1021,8 @@ ModuleImport::getConstantsToConvert(llvm::Constant *constant) { llvm::Constant *current = workList.back(); // Collect all dependencies of the current constant and add them to the // adjacency list if none has been computed before. - auto adjacencyIt = adjacencyLists.find(current); - if (adjacencyIt == adjacencyLists.end()) { - adjacencyIt = adjacencyLists.try_emplace(current).first; + auto [adjacencyIt, inserted] = adjacencyLists.try_emplace(current); + if (inserted) { // Add all constant operands to the adjacency list and skip any other // values such as basic block addresses. for (llvm::Value *operand : current->operands()) From 92e75c095bb380039f32218534f78c4580bf76e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 5 Sep 2024 11:44:12 -0700 Subject: [PATCH 272/425] Reland [flang][cuda] Add c_devptr and bypass output semantic check (#107353) Add a builtin type for c_devptr since it will need some special handling for some functions like c_f_pointer. `c_ptr` is defined as a builtin type and was raising a semantic error if you tried to use it in an I/O statement. This patch adds a check for c_ptr and c_devptr to bypass the semantic check and allow variables of these types to be used in I/O. This version of the patch keeps the semantic error when -pedantic is enabled to align with gfortran.
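To make the intended behavior concrete, a minimal standalone sketch of the decision being added follows; ShouldBypassIoCheck and its parameters are illustrative stand-ins, not flang's actual semantics API:

  #include <iostream>
  #include <string>

  // Sketch only: should an I/O item whose type is the named builtin derived
  // type skip the usual "inaccessible component" diagnostic?
  static bool ShouldBypassIoCheck(const std::string &typeName,
                                  bool pedanticWarnings) {
    bool isCInteropHandle = typeName == "c_ptr" || typeName == "c_devptr";
    return isCInteropHandle && !pedanticWarnings;
  }

  int main() {
    // Default behavior after this change: PRINT of a c_devptr is accepted.
    std::cout << ShouldBypassIoCheck("c_devptr", false) << '\n'; // 1
    // With -pedantic the original error is kept, matching gfortran.
    std::cout << ShouldBypassIoCheck("c_devptr", true) << '\n';  // 0
    return 0;
  }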
--- flang/include/flang/Common/Fortran-features.h | 2 +- flang/lib/Semantics/check-io.cpp | 6 ++++++ flang/module/__fortran_builtins.f90 | 4 ++++ flang/test/Lower/CUDA/cuda-devptr.cuf | 16 ++++++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/CUDA/cuda-devptr.cuf diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 6ef5f44c89db0..0c8a3d2bd5281 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 54e8e09cbf7e4..46f07842b92cb 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -1171,6 +1171,12 @@ parser::Message *IoChecker::CheckForBadIoType(const evaluate::DynamicType &type, "Derived type '%s' in I/O may not be polymorphic unless using defined I/O"_err_en_US, derived.name()); } + if ((IsBuiltinDerivedType(&derived, "c_ptr") || + IsBuiltinDerivedType(&derived, "c_devptr")) && + !context_.ShouldWarn(common::LanguageFeature::PrintCptr)) { + // Bypass the check below for c_ptr and c_devptr. + return nullptr; + } if (const Symbol * bad{FindInaccessibleComponent(which, derived, scope)}) { return &context_.Say(where, diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90 index 44b0f17339cd9..a9d3ac897eb58 100644 --- a/flang/module/__fortran_builtins.f90 +++ b/flang/module/__fortran_builtins.f90 @@ -102,6 +102,10 @@ __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, & __builtin_gridDim integer, parameter, public :: __builtin_warpsize = 32 + + type, public, bind(c) :: __builtin_c_devptr + type(__builtin_c_ptr) :: cptr + end type intrinsic :: __builtin_fma intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, & diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf new file mode 100644 index 0000000000000..4e11e3c0fc8f8 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -0,0 +1,16 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran specific type + +subroutine sub1() + use iso_c_binding + use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type(c_ptr) :: ptr + type(c_devptr) :: dptr + print*,ptr + print*,dptr +end + +! CHECK-LABEL: func.func @_QPsub1() +! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputDerivedType From d219c63b16851ba264b6495e3f63016d1c8b2aac Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Thu, 5 Sep 2024 20:45:31 +0200 Subject: [PATCH 273/425] [Clang] Fix crash with `source_location` in lambda declarators. (#107411) Parsing lambdas requires pushing a declaration context for the lambda, so that parameters can be attached to it, before its trailing type is parsed. At that point, partially-parsed lambdas don't have a name that can be computed for them. This would cause source_location::current() to crash when used in the decltype of a lambda().
We work around this by producing a source_location for an enclosing scope in that scenario. Fixes #67134 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/Expr.cpp | 10 ++++++++++ clang/test/SemaCXX/source_location.cpp | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ebd0b7371e1be..a2e91fd648cce 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -369,6 +369,7 @@ Bug Fixes to C++ Support - Clang no longer tries to capture non-odr used default arguments of template parameters of generic lambdas (#GH107048) - Fixed a bug where defaulted comparison operators would remove ``const`` from base classes. (#GH102588) +- Fix a crash when using ``source_location`` in the trailing return type of a lambda expression. (#GH67134) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 96c6276f3f34c..27930db019a17 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -13,6 +13,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/APValue.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTLambda.h" #include "clang/AST/Attr.h" #include "clang/AST/ComputeDependence.h" #include "clang/AST/DeclCXX.h" @@ -2287,6 +2288,15 @@ APValue SourceLocExpr::EvaluateInContext(const ASTContext &Ctx, Context = getParentContext(); } + // If we are currently parsing a lambda declarator, we might not have a fully + // formed call operator declaration yet, and we could not form a function name + // for it. Because we do not have access to Sema/function scopes here, we + // detect this case by relying on the fact such method doesn't yet have a + // type. + if (const auto *D = dyn_cast(Context); + D && D->getFunctionTypeLoc().isNull() && isLambdaCallOperator(D)) + Context = D->getParent()->getParent(); + PresumedLoc PLoc = Ctx.getSourceManager().getPresumedLoc( Ctx.getSourceManager().getExpansionRange(Loc).getEnd()); diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp index 34177bfe287fc..8b3a5d8dd3327 100644 --- a/clang/test/SemaCXX/source_location.cpp +++ b/clang/test/SemaCXX/source_location.cpp @@ -989,3 +989,26 @@ void Test() { } #endif + + +namespace GH67134 { +template +constexpr auto f(std::source_location loc2 = std::source_location::current()) { return loc; } + +int g = []() -> decltype(f()) { return 0; }(); + +int call() { +#if __cplusplus >= 202002L + return []() -> decltype(f()) { return 0; }(); +#endif + return []() -> decltype(f()) { return 0; }(); +} + +#if __cplusplus >= 202002L +template +int Var = requires { []() -> decltype(f()){}; }; +int h = Var; +#endif + + +} From 18926666f509104c3f478444b282291ce19fab6a Mon Sep 17 00:00:00 2001 From: SJW <48454132+sjw36@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:46:18 -0500 Subject: [PATCH 274/425] [MLIR][SCF] Loop pipelining fails on failed predication (no assert) (#107442) The SCFLoopPipelining allows predication on peeled or loop ops. When the predicationFn returns a nullptr this signifies the op type is unsupported and the pipeliner fails except in `emitPrologue` where it asserts. This patch fixes handling in the prologue to gracefully fail. 
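To make the control-flow change concrete, here is a minimal sketch under simplified assumptions: Op, Rewriter, and PredicateFn below are made-up stand-ins rather than the real MLIR classes. The point is that a null result from the user-supplied predication callback is now propagated to the caller as a recoverable failure instead of tripping an assert in the prologue.

  #include <functional>
  #include <vector>

  struct Rewriter {};
  struct Op {};
  using PredicateFn = std::function<Op *(Rewriter &, Op *, bool)>;

  // Sketch only: returns false when the callback cannot predicate an op, so
  // the caller can skip pipelining this loop rather than abort the compiler.
  static bool emitPrologue(Rewriter &rewriter, const std::vector<Op *> &ops,
                           const PredicateFn &predicateFn) {
    for (Op *op : ops) {
      Op *predicated = predicateFn(rewriter, op, /*predicate=*/true);
      if (predicated == nullptr)
        return false; // graceful failure, previously an assert
    }
    return true;
  }

  int main() {
    Rewriter rewriter;
    Op op;
    std::vector<Op *> ops{&op};
    // A callback that supports nothing: pipelining is skipped, not aborted.
    PredicateFn rejectAll = [](Rewriter &, Op *, bool) -> Op * { return nullptr; };
    return emitPrologue(rewriter, ops, rejectAll) ? 0 : 1;
  }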
--- mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index a34542f0161ac..7cecd4942b640 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -77,7 +77,7 @@ struct LoopPipelinerInternal { bool initializeLoopInfo(ForOp op, const PipeliningOption &options); /// Emits the prologue, this creates `maxStage - 1` part which will contain /// operations from stages [0; i], where i is the part index. - void emitPrologue(RewriterBase &rewriter); + LogicalResult emitPrologue(RewriterBase &rewriter); /// Gather liverange information for Values that are used in a different stage /// than its definition. llvm::MapVector analyzeCrossStageValues(); @@ -263,7 +263,7 @@ cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op, return clone; } -void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { +LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { // Initialize the iteration argument to the loop initial values. for (auto [arg, operand] : llvm::zip(forOp.getRegionIterArgs(), forOp.getInitsMutable())) { @@ -311,7 +311,8 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { if (predicates[predicateIdx]) { OpBuilder::InsertionGuard insertGuard(rewriter); newOp = predicateFn(rewriter, newOp, predicates[predicateIdx]); - assert(newOp && "failed to predicate op."); + if (newOp == nullptr) + return failure(); } if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); @@ -339,6 +340,7 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { } } } + return success(); } llvm::MapVector @@ -772,7 +774,8 @@ FailureOr mlir::scf::pipelineForLoop(RewriterBase &rewriter, ForOp forOp, *modifiedIR = true; // 1. Emit prologue. - pipeliner.emitPrologue(rewriter); + if (failed(pipeliner.emitPrologue(rewriter))) + return failure(); // 2. Track values used across stages. When a value cross stages it will // need to be passed as loop iteration arguments. From 3f1d0e1b1dfef0af0ca5f3315317246d0026fb70 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:53:22 -0700 Subject: [PATCH 275/425] [flang] Silence warning in module file (#107421) Most warnings should be silenced when processing the content of a module file, since the warning should have also appeared when the module file was generated. The case of an intrinsic type kind not being supported for a target wasn't being suppressed; fix. Fixes https://github.com/llvm/llvm-project/issues/107337. 
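As a rough illustration of the suppression logic (Location and warnIfKindNotEnabled are made-up stand-ins, not flang's real interfaces): the warning is emitted only when the use site is ordinary user code, since a use inside a module file was already diagnosed when that module was compiled.

  #include <iostream>

  struct Location {
    bool inModuleFile; // true when the source text comes from a module file
  };

  // Sketch only.
  static void warnIfKindNotEnabled(const char *type, int kind,
                                   bool enabledForTarget, const Location &loc) {
    if (!enabledForTarget && !loc.inModuleFile)
      std::cout << "warning: " << type << "(KIND=" << kind
                << ") is not an enabled type for this target\n";
  }

  int main() {
    warnIfKindNotEnabled("REAL", 16, false, Location{true});  // silent
    warnIfKindNotEnabled("REAL", 16, false, Location{false}); // warns
    return 0;
  }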
--- flang/lib/Semantics/expression.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 60db02dc764b4..3684839c187e6 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -4027,7 +4027,8 @@ bool ExpressionAnalyzer::CheckIntrinsicKind( return true; } else if (foldingContext_.targetCharacteristics().CanSupportType( category, kind)) { - if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget)) { + if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget) && + !context_.IsInModuleFile(GetContextualMessages().at())) { Say("%s(KIND=%jd) is not an enabled type for this target"_warn_en_US, ToUpperCase(EnumToString(category)), kind); } From 5e1e6a689c82aaf2b7af72e074c95889a11d3a78 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 11:58:03 -0700 Subject: [PATCH 276/425] [TableGen] Avoid repeated hash lookups (NFC) (#107429) --- mlir/lib/TableGen/Pattern.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index afb69e7cc5586..1be0e744ffbc8 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -137,11 +137,10 @@ llvm::StringRef DagNode::getSymbol() const { return node->getNameStr(); } Operator &DagNode::getDialectOp(RecordOperatorMap *mapper) const { llvm::Record *opDef = cast(node->getOperator())->getDef(); - auto it = mapper->find(opDef); - if (it != mapper->end()) - return *it->second; - return *mapper->try_emplace(opDef, std::make_unique(opDef)) - .first->second; + auto [it, inserted] = mapper->try_emplace(opDef); + if (inserted) + it->second = std::make_unique(opDef); + return *it->second; } int DagNode::getNumOps() const { From a0dd90eb7dc318c9b3fccb9ba02e1e22fb073094 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:19:31 -0700 Subject: [PATCH 277/425] =?UTF-8?q?[lldb]=20Make=20conversions=20from=20ll?= =?UTF-8?q?vm::Error=20explicit=20with=20Status::FromEr=E2=80=A6=20(#10716?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ror() [NFC] --- lldb/include/lldb/Utility/Status.h | 7 +- lldb/source/API/SBDebugger.cpp | 2 +- lldb/source/API/SBTarget.cpp | 2 +- .../Commands/CommandObjectBreakpoint.cpp | 103 ++++++++++-------- .../Commands/CommandObjectMemoryTag.cpp | 10 +- lldb/source/Commands/CommandObjectStats.cpp | 6 +- lldb/source/Commands/CommandObjectTrace.cpp | 2 +- lldb/source/Core/PluginManager.cpp | 2 +- lldb/source/Core/ThreadedCommunication.cpp | 2 +- lldb/source/Core/ValueObjectVTable.cpp | 2 +- lldb/source/Core/ValueObjectVariable.cpp | 2 +- lldb/source/DataFormatters/VectorType.cpp | 2 +- lldb/source/Host/common/FileCache.cpp | 2 +- .../Host/common/NativeProcessProtocol.cpp | 2 +- lldb/source/Host/common/TCPSocket.cpp | 6 +- lldb/source/Host/macosx/objcxx/Host.mm | 2 +- .../posix/ConnectionFileDescriptorPosix.cpp | 6 +- lldb/source/Interpreter/CommandObject.cpp | 2 +- lldb/source/Interpreter/OptionValueRegex.cpp | 2 +- .../Language/CPlusPlus/BlockPointer.cpp | 2 +- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 2 +- .../Minidump/ObjectFileMinidump.cpp | 2 +- ...latformiOSSimulatorCoreSimulatorSupport.mm | 2 +- .../Process/Linux/NativeProcessLinux.cpp | 16 +-- .../NativeRegisterContextDBReg_arm64.cpp | 10 +- .../Process/elf-core/ProcessElfCore.cpp | 2 +- .../gdb-remote/GDBRemoteCommunication.cpp | 2 +- .../GDBRemoteCommunicationClient.cpp 
| 2 +- .../GDBRemoteCommunicationServer.cpp | 2 +- .../GDBRemoteCommunicationServerLLGS.cpp | 8 +- .../GDBRemoteCommunicationServerPlatform.cpp | 2 +- .../Process/minidump/ProcessMinidump.cpp | 2 +- .../Interfaces/ScriptedPythonInterface.h | 2 +- .../Python/PythonDataObjects.cpp | 22 ++-- .../Python/ScriptInterpreterPython.cpp | 10 +- .../DarwinLog/StructuredDataDarwinLog.cpp | 2 +- lldb/source/Target/ModuleCache.cpp | 2 +- lldb/source/Target/Platform.cpp | 2 +- lldb/source/Target/Process.cpp | 4 +- lldb/source/Target/StackFrame.cpp | 2 +- lldb/source/Target/Thread.cpp | 3 +- lldb/source/Utility/Scalar.cpp | 2 +- lldb/source/Utility/Status.cpp | 8 +- lldb/source/Utility/StructuredData.cpp | 2 +- .../Host/NativeProcessTestUtils.h | 4 +- lldb/unittests/Utility/StatusTest.cpp | 14 ++- 46 files changed, 157 insertions(+), 140 deletions(-) diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index b304291ffae00..3813a3c160470 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -91,9 +91,9 @@ class Status { ~Status(); - // llvm::Error support - explicit Status(llvm::Error error) { *this = std::move(error); } - const Status &operator=(llvm::Error error); + /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. + static Status FromError(llvm::Error error); + /// FIXME: Replace this with a takeError method. llvm::Error ToError() const; /// Get the error string associated with the current error. @@ -145,6 +145,7 @@ class Status { bool Success() const; protected: + Status(llvm::Error error); /// Status code as an integer value. ValueType m_code = 0; /// The type of the above error code. diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 72501570320d5..c226acc15018e 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -220,7 +220,7 @@ lldb::SBError SBDebugger::InitializeWithErrorHandling() { SBError error; if (auto e = g_debugger_lifetime->Initialize( std::make_unique(), LoadPlugin)) { - error.SetError(Status(std::move(e))); + error.SetError(Status::FromError(std::move(e))); } return error; } diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index e927cb854cd88..41eb77e5506bc 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -1658,7 +1658,7 @@ SBError SBTarget::SetLabel(const char *label) { if (!target_sp) return Status::FromErrorString("Couldn't get internal target object."); - return Status(target_sp->SetLabel(label)); + return Status::FromError(target_sp->SetLabel(label)); } uint32_t SBTarget::GetDataByteSize() { diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index ede3dd2f2a864..494d6c50e94ac 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -89,14 +89,16 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { if (success) m_bp_opts.SetAutoContinue(value); else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'i': { uint32_t ignore_count; if (option_arg.getAsInteger(0, ignore_count)) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + 
CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); else m_bp_opts.SetIgnoreCount(ignore_count); } break; @@ -106,29 +108,31 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { if (success) { m_bp_opts.SetOneShot(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 't': { lldb::tid_t thread_id = LLDB_INVALID_THREAD_ID; if (option_arg == "current") { if (!execution_context) { - error = CreateOptionParsingError( + error = Status::FromError(CreateOptionParsingError( option_arg, short_option, long_option, - "No context to determine current thread"); + "No context to determine current thread")); } else { ThreadSP ctx_thread_sp = execution_context->GetThreadSP(); if (!ctx_thread_sp || !ctx_thread_sp->IsValid()) { - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - "No currently selected thread"); + "No currently selected thread")); } else { thread_id = ctx_thread_sp->GetID(); } } } else if (option_arg.getAsInteger(0, thread_id)) { - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); } if (thread_id != LLDB_INVALID_THREAD_ID) m_bp_opts.SetThreadID(thread_id); @@ -142,8 +146,9 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { case 'x': { uint32_t thread_index = UINT32_MAX; if (option_arg.getAsInteger(0, thread_index)) { - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); } else { m_bp_opts.GetThreadSpec()->SetIndex(thread_index); } @@ -286,9 +291,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { case 'u': if (option_arg.getAsInteger(0, m_column)) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + g_int_parsing_error_message)); break; case 'E': { @@ -326,8 +331,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { error_context = "Unsupported language type for exception breakpoint"; } if (!error_context.empty()) - error = CreateOptionParsingError(option_arg, short_option, - long_option, error_context); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, error_context)); } break; case 'f': @@ -343,9 +348,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_catch_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'H': @@ -362,24 +367,24 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_skip_prologue = eLazyBoolNo; if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'l': if (option_arg.getAsInteger(0, m_line_num)) - error = + error = 
Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + g_int_parsing_error_message)); break; case 'L': m_language = Language::GetLanguageTypeFromString(option_arg); if (m_language == eLanguageTypeUnknown) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_language_parsing_error_message); + g_language_parsing_error_message)); break; case 'm': { @@ -392,9 +397,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_move_to_nearest_code = eLazyBoolNo; if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); break; } @@ -412,8 +417,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { if (BreakpointID::StringIsBreakpointName(option_arg, error)) m_breakpoint_names.push_back(std::string(option_arg)); else - error = CreateOptionParsingError( - option_arg, short_option, long_option, "Invalid breakpoint name"); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + "Invalid breakpoint name")); break; } @@ -451,9 +457,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_throw_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error = + error = Status::FromError( CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + g_bool_parsing_error_message)); } break; case 'X': @@ -465,8 +471,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { OptionValueFileColonLine value; Status fcl_err = value.SetValueFromString(option_arg); if (!fcl_err.Success()) { - error = CreateOptionParsingError(option_arg, short_option, - long_option, fcl_err.AsCString()); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, fcl_err.AsCString())); } else { m_filenames.AppendIfUnique(value.GetFileSpec()); m_line_num = value.GetLineNumber(); @@ -1551,13 +1557,15 @@ class BreakpointNameOptionGroup : public OptionGroup { break; case 'B': if (m_breakpoint.SetValueFromString(option_arg).Fail()) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_int_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message)); break; case 'D': if (m_use_dummy.SetValueFromString(option_arg).Fail()) - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); break; case 'H': m_help_string.SetValueFromString(option_arg); @@ -1610,8 +1618,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowList(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'A': { bool value, success; @@ -1619,8 +1628,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDisable(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + 
CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; case 'D': { bool value, success; @@ -1628,8 +1638,9 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDelete(value); } else - error = CreateOptionParsingError(option_arg, short_option, long_option, - g_bool_parsing_error_message); + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message)); } break; default: llvm_unreachable("Unimplemented option"); @@ -2113,8 +2124,8 @@ class CommandObjectBreakpointRead : public CommandObjectParsed { Status name_error; if (!BreakpointID::StringIsBreakpointName(llvm::StringRef(option_arg), name_error)) { - error = CreateOptionParsingError(option_arg, short_option, - long_option, name_error.AsCString()); + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, name_error.AsCString())); } m_names.push_back(std::string(option_arg)); break; diff --git a/lldb/source/Commands/CommandObjectMemoryTag.cpp b/lldb/source/Commands/CommandObjectMemoryTag.cpp index f45d6eacab3d0..bc76319018da9 100644 --- a/lldb/source/Commands/CommandObjectMemoryTag.cpp +++ b/lldb/source/Commands/CommandObjectMemoryTag.cpp @@ -77,7 +77,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { process->GetMemoryTagManager(); if (!tag_manager_or_err) { - result.SetError(Status(tag_manager_or_err.takeError())); + result.SetError(Status::FromError(tag_manager_or_err.takeError())); return; } @@ -102,7 +102,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { tag_manager->MakeTaggedRange(start_addr, end_addr, memory_regions); if (!tagged_range) { - result.SetError(Status(tagged_range.takeError())); + result.SetError(Status::FromError(tagged_range.takeError())); return; } @@ -110,7 +110,7 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed { tagged_range->GetRangeBase(), tagged_range->GetByteSize()); if (!tags) { - result.SetError(Status(tags.takeError())); + result.SetError(Status::FromError(tags.takeError())); return; } @@ -230,7 +230,7 @@ class CommandObjectMemoryTagWrite : public CommandObjectParsed { process->GetMemoryTagManager(); if (!tag_manager_or_err) { - result.SetError(Status(tag_manager_or_err.takeError())); + result.SetError(Status::FromError(tag_manager_or_err.takeError())); return; } @@ -282,7 +282,7 @@ class CommandObjectMemoryTagWrite : public CommandObjectParsed { memory_regions); if (!tagged_range) { - result.SetError(Status(tagged_range.takeError())); + result.SetError(Status::FromError(tagged_range.takeError())); return; } diff --git a/lldb/source/Commands/CommandObjectStats.cpp b/lldb/source/Commands/CommandObjectStats.cpp index 53855e7d03165..7d333afc231ba 100644 --- a/lldb/source/Commands/CommandObjectStats.cpp +++ b/lldb/source/Commands/CommandObjectStats.cpp @@ -87,21 +87,21 @@ class CommandObjectStatsDump : public CommandObjectParsed { OptionArgParser::ToBoolean("--targets", option_arg)) m_stats_options.SetIncludeTargets(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; case 'm': if (llvm::Expected bool_or_error = OptionArgParser::ToBoolean("--modules", option_arg)) m_stats_options.SetIncludeModules(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; case 't': if (llvm::Expected bool_or_error = 
OptionArgParser::ToBoolean("--transcript", option_arg)) m_stats_options.SetIncludeTranscript(*bool_or_error); else - error = bool_or_error.takeError(); + error = Status::FromError(bool_or_error.takeError()); break; default: llvm_unreachable("Unimplemented option"); diff --git a/lldb/source/Commands/CommandObjectTrace.cpp b/lldb/source/Commands/CommandObjectTrace.cpp index 5bcbc236301cc..5e212e05461a6 100644 --- a/lldb/source/Commands/CommandObjectTrace.cpp +++ b/lldb/source/Commands/CommandObjectTrace.cpp @@ -361,7 +361,7 @@ class CommandObjectTraceSchema : public CommandObjectParsed { Trace::FindPluginSchema(plugin_name)) result.AppendMessage(*schemaOrErr); else - error = schemaOrErr.takeError(); + error = Status::FromError(schemaOrErr.takeError()); } if (error.Success()) { diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index fd5cb792c101a..a5219025495a9 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -723,7 +723,7 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp, llvm::Expected ret = process_sp->SaveCore(options.GetOutputFile()->GetPath()); if (!ret) - return Status(ret.takeError()); + return Status::FromError(ret.takeError()); if (ret.get()) return Status(); } diff --git a/lldb/source/Core/ThreadedCommunication.cpp b/lldb/source/Core/ThreadedCommunication.cpp index d8b567c9bd0de..649ce71c29374 100644 --- a/lldb/source/Core/ThreadedCommunication.cpp +++ b/lldb/source/Core/ThreadedCommunication.cpp @@ -178,7 +178,7 @@ bool ThreadedCommunication::StartReadThread(Status *error_ptr) { m_read_thread = *maybe_thread; } else { if (error_ptr) - *error_ptr = Status(maybe_thread.takeError()); + *error_ptr = Status::FromError(maybe_thread.takeError()); else { LLDB_LOG_ERROR(GetLog(LLDBLog::Host), maybe_thread.takeError(), "failed to launch host thread: {0}"); diff --git a/lldb/source/Core/ValueObjectVTable.cpp b/lldb/source/Core/ValueObjectVTable.cpp index 66e0750b63f8f..e38f0a83df994 100644 --- a/lldb/source/Core/ValueObjectVTable.cpp +++ b/lldb/source/Core/ValueObjectVTable.cpp @@ -220,7 +220,7 @@ bool ValueObjectVTable::UpdateValue() { llvm::Expected vtable_info_or_err = language_runtime->GetVTableInfo(*parent, /*check_type=*/true); if (!vtable_info_or_err) { - m_error = vtable_info_or_err.takeError(); + m_error = Status::FromError(vtable_info_or_err.takeError()); return false; } diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 01f871a6f8bc4..29aefb270c92c 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -249,7 +249,7 @@ bool ValueObjectVariable::UpdateValue() { SetValueIsValid(m_error.Success()); } else { - m_error = maybe_value.takeError(); + m_error = Status::FromError(maybe_value.takeError()); // could not find location, won't allow editing m_resolved_value.SetContext(Value::ContextType::Invalid, nullptr); } diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index 19de204c24353..f6c38f76fea31 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -233,7 +233,7 @@ class VectorTypeSyntheticFrontEnd : public SyntheticChildrenFrontEnd { auto num_children_or_err = CalculateNumChildren(); if (!num_children_or_err) return ValueObjectConstResult::Create( - nullptr, Status(num_children_or_err.takeError())); + nullptr, Status::FromError(num_children_or_err.takeError())); if (idx >= 
*num_children_or_err) return {}; std::optional size = m_child_type.GetByteSize(nullptr); diff --git a/lldb/source/Host/common/FileCache.cpp b/lldb/source/Host/common/FileCache.cpp index 4ac198171f96c..87c90a03fd98d 100644 --- a/lldb/source/Host/common/FileCache.cpp +++ b/lldb/source/Host/common/FileCache.cpp @@ -32,7 +32,7 @@ lldb::user_id_t FileCache::OpenFile(const FileSpec &file_spec, } auto file = FileSystem::Instance().Open(file_spec, flags, mode); if (!file) { - error = file.takeError(); + error = Status::FromError(file.takeError()); return UINT64_MAX; } lldb::user_id_t fd = file.get()->GetDescriptor(); diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index d3b9dde368db0..a84d8db1c8794 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -350,7 +350,7 @@ Status NativeProcessProtocol::SetSoftwareBreakpoint(lldb::addr_t addr, } auto expected_bkpt = EnableSoftwareBreakpoint(addr, size_hint); if (!expected_bkpt) - return Status(expected_bkpt.takeError()); + return Status::FromError(expected_bkpt.takeError()); m_software_breakpoints.emplace(addr, std::move(*expected_bkpt)); return Status(); diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp index 4f1518ef697ff..b28ba148ee1af 100644 --- a/lldb/source/Host/common/TCPSocket.cpp +++ b/lldb/source/Host/common/TCPSocket.cpp @@ -137,7 +137,7 @@ Status TCPSocket::Connect(llvm::StringRef name) { Status error; llvm::Expected host_port = DecodeHostAndPort(name); if (!host_port) - return Status(host_port.takeError()); + return Status::FromError(host_port.takeError()); std::vector addresses = SocketAddress::GetAddressInfo(host_port->hostname.c_str(), nullptr, @@ -176,7 +176,7 @@ Status TCPSocket::Listen(llvm::StringRef name, int backlog) { Status error; llvm::Expected host_port = DecodeHostAndPort(name); if (!host_port) - return Status(host_port.takeError()); + return Status::FromError(host_port.takeError()); if (host_port->hostname == "*") host_port->hostname = "0.0.0.0"; @@ -287,7 +287,7 @@ Status TCPSocket::Accept(Socket *&conn_socket) { accept_loop.RequestTermination(); }); if (!expected_handles) - return Status(expected_handles.takeError()); + return Status::FromError(expected_handles.takeError()); return accept_loop.Run(); } diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index 94a2b916574c6..fe63cc16c6499 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -316,7 +316,7 @@ repeat with the_window in (get windows)\n\ unix_socket_name, [&] { return AcceptPIDFromInferior(connect_url); }); if (!accept_thread) - return Status(accept_thread.takeError()); + return Status::FromError(accept_thread.takeError()); [applescript executeAndReturnError:nil]; diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 6a40f66be39b1..2a2fcf00c0adf 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -652,7 +652,7 @@ ConnectionFileDescriptor::ConnectUDP(llvm::StringRef s, Socket::UdpConnect(s, m_child_processes_inherit); if (!socket) { if (error_ptr) - *error_ptr = socket.takeError(); + *error_ptr = Status::FromError(socket.takeError()); else LLDB_LOG_ERROR(GetLog(LLDBLog::Connection), socket.takeError(), "tcp connect failed: {0}"); @@ -769,7 
+769,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectSerialPort( SerialPort::OptionsFromURL(qs); if (!serial_options) { if (error_ptr) - *error_ptr = serial_options.takeError(); + *error_ptr = Status::FromError(serial_options.takeError()); else llvm::consumeError(serial_options.takeError()); return eConnectionStatusError; @@ -786,7 +786,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectSerialPort( fd, File::eOpenOptionReadWrite, serial_options.get(), true); if (!serial_sp) { if (error_ptr) - *error_ptr = serial_sp.takeError(); + *error_ptr = Status::FromError(serial_sp.takeError()); else llvm::consumeError(serial_sp.takeError()); return eConnectionStatusError; diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index c819024ccf018..cf2682cd26faa 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -121,7 +121,7 @@ bool CommandObject::ParseOptions(Args &args, CommandReturnObject &result) { args = std::move(*args_or); error = options->NotifyOptionParsingFinished(&exe_ctx); } else - error = args_or.takeError(); + error = Status::FromError(args_or.takeError()); if (error.Success()) { if (options->VerifyOptions(result)) diff --git a/lldb/source/Interpreter/OptionValueRegex.cpp b/lldb/source/Interpreter/OptionValueRegex.cpp index d810df503f589..91ec41df6ee50 100644 --- a/lldb/source/Interpreter/OptionValueRegex.cpp +++ b/lldb/source/Interpreter/OptionValueRegex.cpp @@ -51,7 +51,7 @@ Status OptionValueRegex::SetValueFromString(llvm::StringRef value, m_value_was_set = true; NotifyValueChanged(); } else if (llvm::Error err = m_regex.GetError()) { - return Status(std::move(err)); + return Status::FromError(std::move(err)); } else { return Status::FromErrorString("regex error"); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index 2c9b3c425397a..f3c137d99703b 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -114,7 +114,7 @@ class BlockPointerSyntheticFrontEnd : public SyntheticChildrenFrontEnd { if (!child_type_or_err) return ValueObjectConstResult::Create( exe_ctx.GetBestExecutionContextScope(), - Status(child_type_or_err.takeError())); + Status::FromError(child_type_or_err.takeError())); CompilerType child_type = *child_type_or_err; ValueObjectSP struct_pointer_sp = diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 2004622e547be..b28beab117cca 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6822,7 +6822,7 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp, outfile, File::eOpenOptionWriteOnly | File::eOpenOptionTruncate | File::eOpenOptionCanCreate); if (!core_file) { - error = core_file.takeError(); + error = Status::FromError(core_file.takeError()); } else { // Read 1 page at a time uint8_t bytes[0x1000]; diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp index 0897895e6bc25..5da69dd4f2ce7 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp @@ -70,7 +70,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp, 
options.GetOutputFile().value(), File::eOpenOptionWriteOnly | File::eOpenOptionCanCreate); if (!maybe_core_file) { - error = maybe_core_file.takeError(); + error = Status::FromError(maybe_core_file.takeError()); return false; } MinidumpFileBuilder builder(std::move(maybe_core_file.get()), process_sp, diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm index 2825db6e3a6b5..303a5409c6fe4 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm @@ -408,7 +408,7 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, launch_info.GetPTY().GetSecondaryFileDescriptor(); if (secondary_fd == PseudoTerminal::invalid_fd) { if (llvm::Error Err = launch_info.GetPTY().OpenSecondary(O_RDWR)) - return Status(std::move(Err)); + return Status::FromError(std::move(Err)); } secondary_fd = launch_info.GetPTY().GetSecondaryFileDescriptor(); assert(secondary_fd != PseudoTerminal::invalid_fd); diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 1e2e3a80b18bf..cc0e34eecdf30 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1287,7 +1287,7 @@ Status NativeProcessLinux::PopulateMemoryRegionCache() { return true; } - Result = Info.takeError(); + Result = Status::FromError(Info.takeError()); m_supports_mem_region = LazyBool::eLazyBoolNo; LLDB_LOG(log, "failed to parse proc maps: {0}", Result); return false; @@ -1475,7 +1475,7 @@ Status NativeProcessLinux::ReadMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length read if (!len) @@ -1530,7 +1530,7 @@ Status NativeProcessLinux::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length write if (!len) @@ -1547,18 +1547,18 @@ Status NativeProcessLinux::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected> unpacked_tags_or_err = details->manager->UnpackTagsData(tags); if (!unpacked_tags_or_err) - return Status(unpacked_tags_or_err.takeError()); + return Status::FromError(unpacked_tags_or_err.takeError()); llvm::Expected> repeated_tags_or_err = details->manager->RepeatTagsForRange(*unpacked_tags_or_err, range); if (!repeated_tags_or_err) - return Status(repeated_tags_or_err.takeError()); + return Status::FromError(repeated_tags_or_err.takeError()); // Repack them for ptrace to use llvm::Expected> final_tag_data = details->manager->PackTags(*repeated_tags_or_err); if (!final_tag_data) - return Status(final_tag_data.takeError()); + return Status::FromError(final_tag_data.takeError()); struct iovec tags_vec; uint8_t *src = final_tag_data->data(); @@ -1790,7 +1790,7 @@ void NativeProcessLinux::NotifyTracersProcessWillResume() { Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) { Log *log = GetLog(POSIXLog::Thread); - Status error(m_intel_pt_collector.OnThreadCreated(tid)); + Status error = 
Status::FromError(m_intel_pt_collector.OnThreadCreated(tid)); if (error.Fail()) LLDB_LOG(log, "Failed to trace a new thread with intel-pt, tid = {0}. {1}", tid, error.AsCString()); @@ -1799,7 +1799,7 @@ Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) { Status NativeProcessLinux::NotifyTracersOfThreadDestroyed(lldb::tid_t tid) { Log *log = GetLog(POSIXLog::Thread); - Status error(m_intel_pt_collector.OnThreadDestroyed(tid)); + Status error = Status::FromError(m_intel_pt_collector.OnThreadDestroyed(tid)); if (error.Fail()) LLDB_LOG(log, "Failed to stop a destroyed thread with intel-pt, tid = {0}. {1}", diff --git a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp index 4bec3de586685..f1d0756b3ed9c 100644 --- a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.cpp @@ -172,7 +172,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareBreakpoints() { // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); for (uint32_t i = 0; i < m_max_hbp_supported; i++) { if (BreakpointIsEnabled(i)) { @@ -191,7 +191,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareBreakpoints() { m_hbp_regs[i].control = tempControl; m_hbp_regs[i].address = tempAddr; - return Status(std::move(error)); + return Status::FromError(std::move(error)); } } } @@ -356,7 +356,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareWatchpoints() { // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); for (uint32_t i = 0; i < m_max_hwp_supported; i++) { if (WatchpointIsEnabled(i)) { @@ -375,7 +375,7 @@ Status NativeRegisterContextDBReg_arm64::ClearAllHardwareWatchpoints() { m_hwp_regs[i].control = tempControl; m_hwp_regs[i].address = tempAddr; - return Status(std::move(error)); + return Status::FromError(std::move(error)); } } } @@ -420,7 +420,7 @@ Status NativeRegisterContextDBReg_arm64::GetWatchpointHitIndex( // Read hardware breakpoint and watchpoint information. llvm::Error error = ReadHardwareDebugInfo(); if (error) - return Status(std::move(error)); + return Status::FromError(std::move(error)); // Mask off ignored bits from watchpoint trap address. 
trap_addr = FixWatchpointHitAddress(trap_addr); diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index 0e8407fc46edf..7955594bf5d94 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -194,7 +194,7 @@ Status ProcessElfCore::DoLoadCore() { // Parse thread contexts and auxv structure if (H.p_type == llvm::ELF::PT_NOTE) { if (llvm::Error error = ParseThreadContextsFromNoteSegment(H, data)) - return Status(std::move(error)); + return Status::FromError(std::move(error)); } // PT_LOAD segments contains address map if (H.p_type == llvm::ELF::PT_LOAD) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp index 50fa11e916c5f..1f1e511346879 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -847,7 +847,7 @@ Status GDBRemoteCommunication::StartListenThread(const char *hostname, llvm::Expected listen_thread = ThreadLauncher::LaunchThread( listen_url, [this] { return GDBRemoteCommunication::ListenThread(); }); if (!listen_thread) - return Status(listen_thread.takeError()); + return Status::FromError(listen_thread.takeError()); m_listen_thread = *listen_thread; return Status(); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 55d76ca8532d3..f6b1db7b8bbbc 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1745,7 +1745,7 @@ Status GDBRemoteCommunicationClient::LoadQXferMemoryMap() { llvm::Expected xml = ReadExtFeature("memory-map", ""); if (!xml) - return Status(xml.takeError()); + return Status::FromError(xml.takeError()); XMLDocument xml_document; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp index 9d08a5d3411f1..9b72cb0035282 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp @@ -123,7 +123,7 @@ GDBRemoteCommunicationServer::SendErrorResponse(llvm::Error error) { [&](std::unique_ptr E) { EIB = std::move(E); }); if (EIB) - return SendErrorResponse(Status(llvm::Error(std::move(EIB)))); + return SendErrorResponse(Status::FromError(llvm::Error(std::move(EIB)))); return SendUnimplementedResponse(""); } diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 504c994f980cf..35fa93e53bc66 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -277,7 +277,7 @@ Status GDBRemoteCommunicationServerLLGS::LaunchProcess() { // lldb-server on Windows. 
#if !defined(_WIN32) if (llvm::Error Err = m_process_launch_info.SetUpPtyRedirection()) - return Status(std::move(Err)); + return Status::FromError(std::move(Err)); #endif } @@ -287,7 +287,7 @@ Status GDBRemoteCommunicationServerLLGS::LaunchProcess() { "process but one already exists"); auto process_or = m_process_manager.Launch(m_process_launch_info, *this); if (!process_or) - return Status(process_or.takeError()); + return Status::FromError(process_or.takeError()); m_continue_process = m_current_process = process_or->get(); m_debugged_processes.emplace( m_current_process->GetID(), @@ -356,7 +356,7 @@ Status GDBRemoteCommunicationServerLLGS::AttachToProcess(lldb::pid_t pid) { // Try to attach. auto process_or = m_process_manager.Attach(pid, *this); if (!process_or) { - Status status(process_or.takeError()); + Status status = Status::FromError(process_or.takeError()); llvm::errs() << llvm::formatv("failed to attach to process {0}: {1}\n", pid, status); return status; @@ -1367,7 +1367,7 @@ GDBRemoteCommunicationServerLLGS::Handle_jLLDBTraceGetBinaryData( llvm::json::parse(packet.Peek(), "TraceGetBinaryDataRequest"); if (!request) - return SendErrorResponse(Status(request.takeError())); + return SendErrorResponse(Status::FromError(request.takeError())); if (Expected> bytes = m_current_process->TraceGetBinaryData(*request)) { diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp index 30e782e3be184..2f2750ec2b920 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp @@ -167,7 +167,7 @@ Status GDBRemoteCommunicationServerPlatform::LaunchGDBServer( if (available_port) port = *available_port; else - return Status(available_port.takeError()); + return Status::FromError(available_port.takeError()); } // Spawn a new thread to accept the port that gets bound after binding to diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 7a326a557547d..ac1ecbfc0e2e7 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -186,7 +186,7 @@ void ProcessMinidump::Terminate() { Status ProcessMinidump::DoLoadCore() { auto expected_parser = MinidumpParser::Create(m_core_data); if (!expected_parser) - return Status(expected_parser.takeError()); + return Status::FromError(expected_parser.takeError()); m_minidump_parser = std::move(*expected_parser); Status error; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index cbb6cd41aa867..c1dcdc7df6cee 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -269,7 +269,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { transformed_args); if (llvm::Error e = expected_return_object.takeError()) { - error = Status(std::move(e)); + error = Status::FromError(std::move(e)); return ErrorWithMessage(caller_signature, "Python method could not be called.", error); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp index ce14b531ea29c..24cf343000632 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp @@ -1105,7 +1105,7 @@ template class OwnedPythonFile : public Base { if (!m_borrowed) { auto r = m_py_obj.CallMethod("close"); if (!r) - py_error = Status(r.takeError()); + py_error = Status::FromError(r.takeError()); } base_error = Base::Close(); if (py_error.Fail()) @@ -1196,7 +1196,7 @@ class PythonIOFile : public OwnedPythonFile { return Flush(); auto r = m_py_obj.CallMethod("close"); if (!r) - return Status(r.takeError()); + return Status::FromError(r.takeError()); return Status(); } @@ -1204,7 +1204,7 @@ class PythonIOFile : public OwnedPythonFile { GIL takeGIL; auto r = m_py_obj.CallMethod("flush"); if (!r) - return Status(r.takeError()); + return Status::FromError(r.takeError()); return Status(); } @@ -1240,12 +1240,12 @@ class BinaryPythonFile : public PythonIOFile { PyObject *pybuffer_p = PyMemoryView_FromMemory( const_cast((const char *)buf), num_bytes, PyBUF_READ); if (!pybuffer_p) - return Status(llvm::make_error()); + return Status::FromError(llvm::make_error()); auto pybuffer = Take(pybuffer_p); num_bytes = 0; auto bytes_written = As(m_py_obj.CallMethod("write", pybuffer)); if (!bytes_written) - return Status(bytes_written.takeError()); + return Status::FromError(bytes_written.takeError()); if (bytes_written.get() < 0) return Status::FromErrorString( ".write() method returned a negative number!"); @@ -1260,7 +1260,7 @@ class BinaryPythonFile : public PythonIOFile { auto pybuffer_obj = m_py_obj.CallMethod("read", (unsigned long long)num_bytes); if (!pybuffer_obj) - return Status(pybuffer_obj.takeError()); + return Status::FromError(pybuffer_obj.takeError()); num_bytes = 0; if (pybuffer_obj.get().IsNone()) { // EOF @@ -1269,7 +1269,7 @@ class BinaryPythonFile : public PythonIOFile { } auto pybuffer = PythonBuffer::Create(pybuffer_obj.get()); if (!pybuffer) - return Status(pybuffer.takeError()); + return Status::FromError(pybuffer.takeError()); memcpy(buf, pybuffer.get().get().buf, pybuffer.get().get().len); num_bytes = pybuffer.get().get().len; return Status(); @@ -1295,12 +1295,12 @@ class TextPythonFile : public PythonIOFile { auto pystring = PythonString::FromUTF8(llvm::StringRef((const char *)buf, num_bytes)); if (!pystring) - return Status(pystring.takeError()); + return Status::FromError(pystring.takeError()); num_bytes = 0; auto bytes_written = As(m_py_obj.CallMethod("write", pystring.get())); if (!bytes_written) - return Status(bytes_written.takeError()); + return Status::FromError(bytes_written.takeError()); if (bytes_written.get() < 0) return Status::FromErrorString( ".write() method returned a negative number!"); @@ -1321,14 +1321,14 @@ class TextPythonFile : public PythonIOFile { auto pystring = As( m_py_obj.CallMethod("read", (unsigned long long)num_chars)); if (!pystring) - return Status(pystring.takeError()); + return Status::FromError(pystring.takeError()); if (pystring.get().IsNone()) { // EOF return Status(); } auto stringref = pystring.get().AsUTF8(); if (!stringref) - return Status(stringref.takeError()); + return Status::FromError(stringref.takeError()); num_bytes = stringref.get().size(); memcpy(buf, stringref.get().begin(), num_bytes); return Status(); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 76f2640a3ea69..63691d24f0dad 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1110,7 +1110,7 @@ Status ScriptInterpreterPythonImpl::ExecuteMultipleLines( options.GetEnableIO(), m_debugger, /*result=*/nullptr); if (!io_redirect_or_error) - return Status(io_redirect_or_error.takeError()); + return Status::FromError(io_redirect_or_error.takeError()); ScriptInterpreterIORedirect &io_redirect = **io_redirect_or_error; @@ -1144,7 +1144,7 @@ Status ScriptInterpreterPythonImpl::ExecuteMultipleLines( E.Restore(); return error; }); - return Status(std::move(error)); + return Status::FromError(std::move(error)); } return Status(); @@ -2393,7 +2393,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( exc_options.GetEnableIO(), m_debugger, /*result=*/nullptr); if (!io_redirect_or_error) { - error = io_redirect_or_error.takeError(); + error = Status::FromError(io_redirect_or_error.takeError()); return false; } @@ -2435,7 +2435,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( if (extra_search_dir) { if (llvm::Error e = ExtendSysPath(extra_search_dir.GetPath())) { - error = std::move(e); + error = Status::FromError(std::move(e)); return false; } } else { @@ -2465,7 +2465,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( } if (llvm::Error e = ExtendSysPath(module_file.GetDirectory().GetCString())) { - error = std::move(e); + error = Status::FromError(std::move(e)); return false; } module_name = module_file.GetFilename().GetCString(); diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index 1137f24451d28..4ca8bd2f9085d 100644 --- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -289,7 +289,7 @@ class RegexFilterRule : public FilterRule { // Instantiate the regex so we can report any errors. 
auto regex = RegularExpression(op_arg); if (llvm::Error err = regex.GetError()) { - error = Status(std::move(err)); + error = Status::FromError(std::move(err)); return FilterRuleSP(); } diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index ce009f9b2fafe..ccae7ea106c97 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -166,7 +166,7 @@ ModuleLock::ModuleLock(const FileSpec &root_dir_spec, const UUID &uuid, m_file_up = std::move(file.get()); else { m_file_up.reset(); - error = Status(file.takeError()); + error = Status::FromError(file.takeError()); return; } diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index b65a27dedc081..7792edcc2cb58 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -1131,7 +1131,7 @@ Status Platform::PutFile(const FileSpec &source, const FileSpec &destination, auto source_file = FileSystem::Instance().Open(source, source_open_options, lldb::eFilePermissionsUserRW); if (!source_file) - return Status(source_file.takeError()); + return Status::FromError(source_file.takeError()); Status error; bool requires_upload = true; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 97ce2c14458e9..3d7ddbe294a49 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6486,13 +6486,13 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len, llvm::Expected tag_manager_or_err = GetMemoryTagManager(); if (!tag_manager_or_err) - return Status(tag_manager_or_err.takeError()); + return Status::FromError(tag_manager_or_err.takeError()); const MemoryTagManager *tag_manager = *tag_manager_or_err; llvm::Expected> packed_tags = tag_manager->PackTags(tags); if (!packed_tags) { - return Status(packed_tags.takeError()); + return Status::FromError(packed_tags.takeError()); } return DoWriteMemoryTags(addr, len, tag_manager->GetAllocationTagType(), diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index e35a4c318d358..1610971a34148 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1104,7 +1104,7 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { m_sc.function->GetFrameBaseExpression().Evaluate( &exe_ctx, nullptr, loclist_base_addr, nullptr, nullptr); if (!expr_value) - m_frame_base_error = expr_value.takeError(); + m_frame_base_error = Status::FromError(expr_value.takeError()); else m_frame_base = expr_value->ResolveValue(&exe_ctx); } else { diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 899e822851d81..902fbb2b519ef 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -2078,7 +2078,8 @@ lldb::ValueObjectSP Thread::GetSiginfoValue() { llvm::Expected> data = GetSiginfo(*type_size); if (!data) - return ValueObjectConstResult::Create(&target, Status(data.takeError())); + return ValueObjectConstResult::Create(&target, + Status::FromError(data.takeError())); DataExtractor data_extractor{data.get()->getBufferStart(), data.get()->getBufferSize(), process_sp->GetByteOrder(), arch.GetAddressByteSize()}; diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 098128b1a50e5..329f5b6e4b9a5 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -684,7 +684,7 @@ Status Scalar::SetValueFromCString(const char *value_str, Encoding encoding, m_type = e_float; m_float = std::move(f); } else - 
error = op.takeError(); + error = Status::FromError(op.takeError()); break; } diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 7260b7b3e0a03..131fc662bfc0a 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -55,10 +55,10 @@ Status::Status(std::string err_str) : m_code(LLDB_GENERIC_ERROR), m_type(eErrorTypeGeneric), m_string(std::move(err_str)) {} -const Status &Status::operator=(llvm::Error error) { +Status::Status(llvm::Error error) { if (!error) { Clear(); - return *this; + return; } // if the error happens to be a errno error, preserve the error code @@ -79,8 +79,6 @@ const Status &Status::operator=(llvm::Error error) { m_type = eErrorTypeGeneric; m_string = llvm::toString(std::move(error)); } - - return *this; } Status Status::FromErrorStringWithFormat(const char *format, ...) { @@ -96,6 +94,8 @@ Status Status::FromErrorStringWithFormat(const char *format, ...) { return Status(string); } +Status Status::FromError(llvm::Error error) { return Status(std::move(error)); } + llvm::Error Status::ToError() const { if (Success()) return llvm::Error::success(); diff --git a/lldb/source/Utility/StructuredData.cpp b/lldb/source/Utility/StructuredData.cpp index 5f9821b979c07..fb4f6920d62eb 100644 --- a/lldb/source/Utility/StructuredData.cpp +++ b/lldb/source/Utility/StructuredData.cpp @@ -46,7 +46,7 @@ StructuredData::ParseJSONFromFile(const FileSpec &input_spec, Status &error) { json::parse(buffer_or_error.get()->getBuffer().str()); if (value) return ParseJSONValue(*value); - error = Status(value.takeError()); + error = Status::FromError(value.takeError()); return StructuredData::ObjectSP(); } diff --git a/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h b/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h index a610b37a6b38e..1a017122411a8 100644 --- a/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h +++ b/lldb/unittests/TestingSupport/Host/NativeProcessTestUtils.h @@ -75,7 +75,7 @@ template class MockProcess : public T { auto ExpectedMemory = this->ReadMemory(Addr, Size); if (!ExpectedMemory) { BytesRead = 0; - return Status(ExpectedMemory.takeError()); + return Status::FromError(ExpectedMemory.takeError()); } BytesRead = ExpectedMemory->size(); assert(BytesRead <= Size); @@ -89,7 +89,7 @@ template class MockProcess : public T { Addr, llvm::ArrayRef(static_cast(Buf), Size)); if (!ExpectedBytes) { BytesWritten = 0; - return Status(ExpectedBytes.takeError()); + return Status::FromError(ExpectedBytes.takeError()); } BytesWritten = *ExpectedBytes; return Status(); diff --git a/lldb/unittests/Utility/StatusTest.cpp b/lldb/unittests/Utility/StatusTest.cpp index d33909ea89727..be4f2beebcdb5 100644 --- a/lldb/unittests/Utility/StatusTest.cpp +++ b/lldb/unittests/Utility/StatusTest.cpp @@ -27,21 +27,20 @@ TEST(StatusTest, Formatv) { } TEST(StatusTest, ErrorConstructor) { - EXPECT_TRUE(Status(llvm::Error::success()).Success()); + EXPECT_TRUE(Status::FromError(llvm::Error::success()).Success()); - Status eagain( + Status eagain = Status::FromError( llvm::errorCodeToError(std::error_code(EAGAIN, std::generic_category()))); EXPECT_TRUE(eagain.Fail()); EXPECT_EQ(eErrorTypePOSIX, eagain.GetType()); EXPECT_EQ(Status::ValueType(EAGAIN), eagain.GetError()); - Status foo(llvm::make_error( - "foo", llvm::inconvertibleErrorCode())); + Status foo = Status::FromError(llvm::createStringError("foo")); EXPECT_TRUE(foo.Fail()); EXPECT_EQ(eErrorTypeGeneric, foo.GetType()); EXPECT_STREQ("foo", foo.AsCString()); - foo = 
llvm::Error::success();
+  foo = Status::FromError(llvm::Error::success());
   EXPECT_TRUE(foo.Success());
 }
 
@@ -52,6 +51,11 @@ TEST(StatusTest, ErrorCodeConstructor) {
   EXPECT_TRUE(eagain.Fail());
   EXPECT_EQ(eErrorTypePOSIX, eagain.GetType());
   EXPECT_EQ(Status::ValueType(EAGAIN), eagain.GetError());
+
+  llvm::Error list = llvm::joinErrors(llvm::createStringError("foo"),
+                                      llvm::createStringError("bar"));
+  Status foobar = Status::FromError(std::move(list));
+  EXPECT_EQ(std::string("foo\nbar"), std::string(foobar.AsCString()));
 }
 
 TEST(StatusTest, ErrorConversion) {

From 9df592fb806b77d5fb0c7a9d5c9057d1626587e3 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 5 Sep 2024 20:25:56 +0100
Subject: [PATCH 278/425] [AArch64] Fold away zext of extract of uzp. (#107367)

Similar to #107201, this comes up from the lowering of zext of
deinterleaving shuffles. Patterns such as ext(extract_subvector(uzp(a, b)))
can be converted to a simple AND that performs the extract/zext from a
uzp1. Uzp2 can be handled with an extra shift, and due to the existing
legalization there may already be an and / shift in between, which can be
combined in. Mostly this reduces instruction count or increases the amount
of parallelism in the sequence.
---
 .../Target/AArch64/AArch64ISelLowering.cpp |  66 +++++++
 llvm/test/CodeGen/AArch64/zext-shuffle.ll  | 162 ++++++++----------
 2 files changed, 135 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0671dd1f0087..c8e8a1e612e0a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22261,6 +22261,70 @@ static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
           DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
 }
 
+// This comes up similarly to the above when lowering deinterleaving shuffles
+// from zexts. We have legalized the operations in the general case to
+// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
+// the extract is to the low half and the uzp is uzp1. There would be an extra
+// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
+// there could also be an existing and / shift that can be combined in, either
+// before or after the extract.
+static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (N->getOpcode() != ISD::ZERO_EXTEND || + (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned ExtOffset = (unsigned)-1; + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + ExtOffset = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + } + + unsigned Shift = 0; + APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(), + Op.getValueType().getScalarSizeInBits()); + + if (Op.getOpcode() == AArch64ISD::VLSHR) { + Shift = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + Mask = Mask.lshr(Shift); + } + if (Op.getOpcode() == ISD::AND && + ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) { + Op = Op.getOperand(0); + Mask = Mask.zext(VT.getScalarSizeInBits()); + } else if (Op.getOpcode() == AArch64ISD::BICi) { + Mask = ~APInt(Op.getValueType().getScalarSizeInBits(), + Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2)); + Mask = Mask.zext(VT.getScalarSizeInBits()); + Op = Op.getOperand(0); + } + + if (ExtOffset == (unsigned)-1) { + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + ExtOffset = Op.getConstantOperandVal(1); + Op = Op.getOperand(0); + } else + return SDValue(); + } + if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements()) + return SDValue(); + + if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2) + return SDValue(); + if (Op.getOpcode() == AArch64ISD::UZP2) + Shift += VT.getScalarSizeInBits() / 2; + + SDLoc DL(N); + SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT, + Op.getOperand(ExtOffset == 0 ? 0 : 1)); + if (Shift != 0) + BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC, + DAG.getConstant(Shift, DL, MVT::i32)); + return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT)); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -22283,6 +22347,8 @@ static SDValue performExtendCombine(SDNode *N, if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG)) return R; + if (SDValue R = performZExtUZPCombine(N, DAG)) + return R; if (N->getValueType(0).isFixedLengthVector() && N->getOpcode() == ISD::SIGN_EXTEND && diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index 6415fba29ff79..6d25c874a2893 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -90,10 +90,11 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) { define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: movi v2.2d, #0x0000000000ffff +; CHECK-NEXT: ushr v0.2d, v0.2d, #16 +; CHECK-NEXT: ushr v1.2d, v1.2d, #16 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> %z1 = zext <4 x i16> %s1 to <4 x i64> @@ -117,10 +118,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: ushr v1.2d, 
v1.2d, #48 ; CHECK-NEXT: ret %s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> %z1 = zext <4 x i16> %s1 to <4 x i64> @@ -142,8 +141,7 @@ define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -210,8 +208,7 @@ define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -278,8 +275,7 @@ define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_1357: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ret @@ -291,10 +287,9 @@ define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -304,10 +299,11 @@ define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ushr v0.8h, v0.8h, #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: ushr v0.4s, v0.4s, #8 +; CHECK-NEXT: ushr v1.4s, v1.4s, #8 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -317,10 +313,10 @@ define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 +; CHECK-NEXT: ushr v1.4s, v1.4s, #16 +; CHECK-NEXT: bic v0.4s, #255, lsl #8 +; CHECK-NEXT: bic v1.4s, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -330,10 +326,8 @@ define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) { define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i32_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ushr v0.8h, v0.8h, #8 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; 
CHECK-NEXT: ushr v0.4s, v0.4s, #24 +; CHECK-NEXT: ushr v1.4s, v1.4s, #24 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i32> @@ -407,77 +401,59 @@ define <8 x i64> @zext_load_add(ptr %p) { define <8 x double> @uitofp_fadd(<32 x i16> %l) { ; CHECK-LABEL: uitofp_fadd: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi d4, #0x00ffff0000ffff -; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 -; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8 -; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s -; CHECK-NEXT: and v17.8b, v6.8b, v4.8b -; CHECK-NEXT: and v18.8b, v7.8b, v4.8b -; CHECK-NEXT: ushr v6.2s, v6.2s, #16 -; CHECK-NEXT: ushr v7.2s, v7.2s, #16 -; CHECK-NEXT: and v21.8b, v0.8b, v4.8b -; CHECK-NEXT: and v22.8b, v2.8b, v4.8b -; CHECK-NEXT: ushr v2.2s, v2.2s, #16 -; CHECK-NEXT: and v19.8b, v16.8b, v4.8b -; CHECK-NEXT: and v20.8b, v5.8b, v4.8b -; CHECK-NEXT: ushll v3.2d, v17.2s, #0 -; CHECK-NEXT: ushll v17.2d, v18.2s, #0 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ushr v16.2s, v16.2s, #16 -; CHECK-NEXT: ushr v5.2s, v5.2s, #16 -; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ushll v7.2d, v7.2s, #0 -; CHECK-NEXT: ushll v18.2d, v19.2s, #0 -; CHECK-NEXT: ushll v19.2d, v20.2s, #0 -; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll v21.2d, v21.2s, #0 -; CHECK-NEXT: ushll v5.2d, v5.2s, #0 -; CHECK-NEXT: ushll v22.2d, v22.2s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v17.2d, v17.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: and v23.8b, v20.8b, v4.8b -; CHECK-NEXT: and v4.8b, v1.8b, v4.8b -; CHECK-NEXT: ushr v20.2s, v20.2s, #16 -; CHECK-NEXT: ushr v1.2s, v1.2s, #16 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: movi v4.2d, #0x0000000000ffff +; CHECK-NEXT: ushr v5.2d, v0.2d, #16 +; CHECK-NEXT: ushr v6.2d, v1.2d, #16 +; CHECK-NEXT: ushr v7.2d, v2.2d, #16 +; CHECK-NEXT: ushr v17.2d, v3.2d, #16 +; CHECK-NEXT: ushr v20.2d, v0.2d, #32 +; CHECK-NEXT: ushr v22.2d, v1.2d, #32 +; CHECK-NEXT: ushr v23.2d, v2.2d, #32 +; CHECK-NEXT: ushr v24.2d, v3.2d, #32 +; CHECK-NEXT: and v16.16b, v0.16b, v4.16b +; CHECK-NEXT: and v18.16b, v1.16b, v4.16b +; CHECK-NEXT: and v19.16b, v2.16b, v4.16b +; CHECK-NEXT: and v21.16b, v3.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v4.16b +; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v7.16b, v7.16b, v4.16b +; CHECK-NEXT: and v17.16b, v17.16b, v4.16b +; CHECK-NEXT: and v20.16b, v20.16b, v4.16b +; CHECK-NEXT: and v22.16b, v22.16b, v4.16b +; CHECK-NEXT: and v23.16b, v23.16b, v4.16b +; CHECK-NEXT: and v4.16b, v24.16b, v4.16b +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: ushr v1.2d, v1.2d, #48 +; CHECK-NEXT: ushr v2.2d, v2.2d, #48 +; CHECK-NEXT: ushr v3.2d, v3.2d, #48 +; CHECK-NEXT: ucvtf v16.2d, v16.2d ; CHECK-NEXT: ucvtf v18.2d, v18.2d ; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ucvtf v16.2d, v16.2d -; CHECK-NEXT: ushll v23.2d, v23.2s, #0 -; CHECK-NEXT: ushll v4.2d, v4.2s, #0 -; CHECK-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v5.2d, v5.2d ; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: 
ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d ; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v23.2d, v23.2d ; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v20.2d, v20.2d +; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d -; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d -; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d -; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d -; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d -; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d -; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d -; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d +; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d +; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d +; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d +; CHECK-NEXT: fadd v0.2d, v20.2d, v0.2d +; CHECK-NEXT: fadd v1.2d, v22.2d, v1.2d +; CHECK-NEXT: fadd v3.2d, v4.2d, v3.2d +; CHECK-NEXT: fadd v2.2d, v23.2d, v2.2d +; CHECK-NEXT: fadd v0.2d, v5.2d, v0.2d +; CHECK-NEXT: fadd v1.2d, v6.2d, v1.2d ; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d -; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d -; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d +; CHECK-NEXT: fadd v3.2d, v17.2d, v3.2d ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = uitofp <8 x i16> %s1 to <8 x double> From 362da640dd18e2ef960e0d2198fe8378104c4119 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 12:32:07 -0700 Subject: [PATCH 279/425] [SandboxIR][Bench] Test RAUW (#107440) --- llvm/benchmarks/SandboxIRBench.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index 633de6db1f5e2..1a7f808af21f2 100644 --- a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -106,10 +106,40 @@ template static void GetType(benchmark::State &State) { benchmark::DoNotOptimize(I->getType()); } +static std::string generateRAUWIR(unsigned Size) { + std::stringstream SS; + SS << "define void @foo(i32 %v1, i32 %v2) {\n"; + SS << " %def1 = add i32 %v1, %v2\n"; + SS << " %def2 = add i32 %v1, %v2\n"; + for (auto Cnt : seq(0, Size)) + SS << " %add" << Cnt << " = add i32 %def1, %def1\n"; + SS << "ret void"; + SS << "}"; + return SS.str(); +} + +template static void RAUW(benchmark::State &State) { + LLVMContext LLVMCtx; + sandboxir::Context Ctx(LLVMCtx); + std::unique_ptr LLVMM; + unsigned NumInstrs = State.range(0); + auto *BB = genIR(LLVMM, LLVMCtx, Ctx, generateRAUWIR, NumInstrs); + auto It = BB->begin(); + auto *Def1 = &*It++; + auto *Def2 = &*It++; + for (auto _ : State) { + Def1->replaceAllUsesWith(Def2); + Def2->replaceAllUsesWith(Def1); + } +} + BENCHMARK(GetType); BENCHMARK(GetType); BENCHMARK(BBWalk)->Args({1024}); BENCHMARK(BBWalk)->Args({1024}); +BENCHMARK(RAUW)->Args({512}); +BENCHMARK(RAUW)->Args({512}); + BENCHMARK_MAIN(); From f32e5bdcefcff80f4296f8f4abedc37dcda36d53 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 5 Sep 2024 12:34:47 -0700 Subject: [PATCH 280/425] [NFC] Rename the `Nr` abbreviation to `Num` (#107151) It's more clear. (This isn't exhaustive). 
--- .../lib/ctx_profile/CtxInstrContextNode.h | 25 +++++----- .../lib/ctx_profile/CtxInstrProfiling.cpp | 41 +++++++-------- .../lib/ctx_profile/CtxInstrProfiling.h | 4 +- llvm/include/llvm/Analysis/MLModelRunner.h | 4 +- .../llvm/ProfileData/CtxInstrContextNode.h | 25 +++++----- .../Analysis/FunctionPropertiesAnalysis.cpp | 4 +- llvm/lib/Analysis/MLInlineAdvisor.cpp | 6 +-- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 36 ++++++------- .../Instrumentation/PGOCtxProfLowering.cpp | 50 +++++++++---------- .../Instrumentation/PGOInstrumentation.cpp | 6 +-- .../PGOCtxProfReaderWriterTest.cpp | 8 +-- 11 files changed, 106 insertions(+), 103 deletions(-) diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h index a916f197aa148..5991458c5732d 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h @@ -68,18 +68,19 @@ using GUID = uint64_t; class ContextNode final { const GUID Guid; ContextNode *const Next; - const uint32_t NrCounters; - const uint32_t NrCallsites; + const uint32_t NumCounters; + const uint32_t NumCallsites; public: - ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, + ContextNode(GUID Guid, uint32_t NumCounters, uint32_t NumCallsites, ContextNode *Next = nullptr) - : Guid(Guid), Next(Next), NrCounters(NrCounters), - NrCallsites(NrCallsites) {} + : Guid(Guid), Next(Next), NumCounters(NumCounters), + NumCallsites(NumCallsites) {} - static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) { - return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters + - sizeof(ContextNode *) * NrCallsites; + static inline size_t getAllocSize(uint32_t NumCounters, + uint32_t NumCallsites) { + return sizeof(ContextNode) + sizeof(uint64_t) * NumCounters + + sizeof(ContextNode *) * NumCallsites; } // The counters vector starts right after the static header. @@ -88,8 +89,8 @@ class ContextNode final { return reinterpret_cast(addr_after); } - uint32_t counters_size() const { return NrCounters; } - uint32_t callsites_size() const { return NrCallsites; } + uint32_t counters_size() const { return NumCounters; } + uint32_t callsites_size() const { return NumCallsites; } const uint64_t *counters() const { return const_cast(this)->counters(); @@ -97,7 +98,7 @@ class ContextNode final { // The subcontexts vector starts right after the end of the counters vector. 
ContextNode **subContexts() { - return reinterpret_cast(&(counters()[NrCounters])); + return reinterpret_cast(&(counters()[NumCounters])); } ContextNode *const *subContexts() const { @@ -107,7 +108,7 @@ class ContextNode final { GUID guid() const { return Guid; } ContextNode *next() const { return Next; } - size_t size() const { return getAllocSize(NrCounters, NrCallsites); } + size_t size() const { return getAllocSize(NumCounters, NumCallsites); } uint64_t entrycount() const { return counters()[0]; } }; diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp index a0a535015bf2e..df30986cdfc69 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp @@ -92,10 +92,11 @@ bool validate(const ContextRoot *Root) { } inline ContextNode *allocContextNode(char *Place, GUID Guid, - uint32_t NrCounters, uint32_t NrCallsites, + uint32_t NumCounters, + uint32_t NumCallsites, ContextNode *Next = nullptr) { assert(reinterpret_cast(Place) % ExpectedAlignment == 0); - return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next); + return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next); } void resetContextNode(ContextNode &Node) { @@ -161,8 +162,8 @@ void Arena::freeArenaList(Arena *&A) { // If this is the first time we hit a callsite with this (Guid) particular // callee, we need to allocate. ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint, - uint32_t NrCounters, uint32_t NrCallsites) { - auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites); + uint32_t NumCounters, uint32_t NumCallsites) { + auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem; char *AllocPlace = Mem->tryBumpAllocate(AllocSize); if (!AllocPlace) { @@ -175,15 +176,15 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint, Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem); AllocPlace = Mem->tryBumpAllocate(AllocSize); } - auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites, + auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites, *InsertionPoint); *InsertionPoint = Ret; return Ret; } ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, - uint32_t NrCounters, - uint32_t NrCallsites) { + uint32_t NumCounters, + uint32_t NumCallsites) { // fast "out" if we're not even doing contextual collection. if (!__llvm_ctx_profile_current_context_root) return TheScratchContext; @@ -222,14 +223,14 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, Callsite = Callsite->next(); } auto *Ret = Callsite ? 
Callsite - : getCallsiteSlow(Guid, CallsiteContext, NrCounters, - NrCallsites); - if (Ret->callsites_size() != NrCallsites || - Ret->counters_size() != NrCounters) + : getCallsiteSlow(Guid, CallsiteContext, NumCounters, + NumCallsites); + if (Ret->callsites_size() != NumCallsites || + Ret->counters_size() != NumCounters) __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: " "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n", - reinterpret_cast(Ret), Guid, NrCallsites, - NrCounters, Ret->guid(), Ret->callsites_size(), + reinterpret_cast(Ret), Guid, NumCallsites, + NumCounters, Ret->guid(), Ret->callsites_size(), Ret->counters_size()); onContextEnter(*Ret); return Ret; @@ -237,19 +238,19 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, // This should be called once for a Root. Allocate the first arena, set up the // first context. -void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters, - uint32_t NrCallsites) { +void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters, + uint32_t NumCallsites) { __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); // Re-check - we got here without having had taken a lock. if (Root->FirstMemBlock) return; - const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites); + const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed)); Root->FirstMemBlock = M; Root->CurrentMem = M; Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid, - NrCounters, NrCallsites); + NumCounters, NumCallsites); AllContextRoots.PushBack(Root); } @@ -278,7 +279,7 @@ void __llvm_ctx_profile_release_context(ContextRoot *Root) } void __llvm_ctx_profile_start_collection() { - size_t NrMemUnits = 0; + size_t NumMemUnits = 0; __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) { @@ -286,11 +287,11 @@ void __llvm_ctx_profile_start_collection() { __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock( &Root->Taken); for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) - ++NrMemUnits; + ++NumMemUnits; resetContextNode(*Root->FirstNode); } - __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits); + __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits); } bool __llvm_ctx_profile_fetch(void *Data, diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h index f55068e98dd43..74d346d6e0a07 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h @@ -153,8 +153,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root); /// called for any other function than entry points, in the entry BB of such /// function. Same consideration about LSB of returned value as .._start_context ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid, - uint32_t NrCounters, - uint32_t NrCallsites); + uint32_t NumCounters, + uint32_t NumCallsites); /// Prepares for collection. Currently this resets counter values but preserves /// internal context tree structure. 
diff --git a/llvm/include/llvm/Analysis/MLModelRunner.h b/llvm/include/llvm/Analysis/MLModelRunner.h index 21f155de85aec..9f7f19a369d9c 100644 --- a/llvm/include/llvm/Analysis/MLModelRunner.h +++ b/llvm/include/llvm/Analysis/MLModelRunner.h @@ -54,8 +54,8 @@ class MLModelRunner { virtual void switchContext(StringRef Name) {} protected: - MLModelRunner(LLVMContext &Ctx, Kind Type, size_t NrInputs) - : Ctx(Ctx), Type(Type), InputBuffers(NrInputs) { + MLModelRunner(LLVMContext &Ctx, Kind Type, size_t NumInputs) + : Ctx(Ctx), Type(Type), InputBuffers(NumInputs) { assert(Type != Kind::Unknown); } virtual void *evaluateUntyped() = 0; diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h index a916f197aa148..5991458c5732d 100644 --- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h +++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h @@ -68,18 +68,19 @@ using GUID = uint64_t; class ContextNode final { const GUID Guid; ContextNode *const Next; - const uint32_t NrCounters; - const uint32_t NrCallsites; + const uint32_t NumCounters; + const uint32_t NumCallsites; public: - ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, + ContextNode(GUID Guid, uint32_t NumCounters, uint32_t NumCallsites, ContextNode *Next = nullptr) - : Guid(Guid), Next(Next), NrCounters(NrCounters), - NrCallsites(NrCallsites) {} + : Guid(Guid), Next(Next), NumCounters(NumCounters), + NumCallsites(NumCallsites) {} - static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) { - return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters + - sizeof(ContextNode *) * NrCallsites; + static inline size_t getAllocSize(uint32_t NumCounters, + uint32_t NumCallsites) { + return sizeof(ContextNode) + sizeof(uint64_t) * NumCounters + + sizeof(ContextNode *) * NumCallsites; } // The counters vector starts right after the static header. @@ -88,8 +89,8 @@ class ContextNode final { return reinterpret_cast(addr_after); } - uint32_t counters_size() const { return NrCounters; } - uint32_t callsites_size() const { return NrCallsites; } + uint32_t counters_size() const { return NumCounters; } + uint32_t callsites_size() const { return NumCallsites; } const uint64_t *counters() const { return const_cast(this)->counters(); @@ -97,7 +98,7 @@ class ContextNode final { // The subcontexts vector starts right after the end of the counters vector. 
ContextNode **subContexts() { - return reinterpret_cast(&(counters()[NrCounters])); + return reinterpret_cast(&(counters()[NumCounters])); } ContextNode *const *subContexts() const { @@ -107,7 +108,7 @@ class ContextNode final { GUID guid() const { return Guid; } ContextNode *next() const { return Next; } - size_t size() const { return getAllocSize(NrCounters, NrCallsites); } + size_t size() const { return getAllocSize(NumCounters, NumCallsites); } uint64_t entrycount() const { return counters()[0]; } }; diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index af6574052c8c8..0ffbc90d7ee22 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -47,7 +47,7 @@ static cl::opt CallWithManyArgumentsThreshold( "it is considered having many arguments.")); namespace { -int64_t getNrBlocksFromCond(const BasicBlock &BB) { +int64_t getNumBlocksFromCond(const BasicBlock &BB) { int64_t Ret = 0; if (const auto *BI = dyn_cast(BB.getTerminator())) { if (BI->isConditional()) @@ -72,7 +72,7 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock &BB, assert(Direction == 1 || Direction == -1); BasicBlockCount += Direction; BlocksReachedFromConditionalInstruction += - (Direction * getNrBlocksFromCond(BB)); + (Direction * getNumBlocksFromCond(BB)); for (const auto &I : BB) { if (auto *CS = dyn_cast(&I)) { const auto *Callee = CS->getCalledFunction(); diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 8bb5efcf1b2ec..2db58d1c2578b 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -400,9 +400,9 @@ std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { if (Mandatory) return getMandatoryAdvice(CB, true); - auto NrCtantParams = 0; + auto NumCtantParams = 0; for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { - NrCtantParams += (isa(*I)); + NumCtantParams += (isa(*I)); } auto &CallerBefore = getCachedFPI(Caller); @@ -414,7 +414,7 @@ std::unique_ptr MLInlineAdvisor::getAdviceImpl(CallBase &CB) { getInitialFunctionLevel(Caller); *ModelRunner->getTensor(FeatureIndex::node_count) = NodeCount; *ModelRunner->getTensor(FeatureIndex::nr_ctant_params) = - NrCtantParams; + NumCtantParams; *ModelRunner->getTensor(FeatureIndex::edge_count) = EdgeCount; *ModelRunner->getTensor(FeatureIndex::caller_users) = CallerBefore.Uses; diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 0a124ac3fd1b6..d099544c2a491 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -273,7 +273,7 @@ struct LIFeatureComponents { double RW = 0; double IndVarUpdates = 0; double HintWeights = 0.0; - int64_t NrDefsAndUses = 0; + int64_t NumDefsAndUses = 0; float HottestBlockFreq = 0.0; bool IsRemat = false; }; @@ -327,7 +327,7 @@ class MLEvictAdvisor : public RegAllocEvictionAdvisor { void extractFeatures(const SmallVectorImpl &Intervals, llvm::SmallVectorImpl &Largest, size_t Pos, - int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent, + int64_t IsHint, int64_t LocalIntfsCount, float NumUrgent, SmallVectorImpl &LRPosInfo) const; // Point-in-time: we didn't learn this, so we always delegate to the @@ -609,7 +609,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( const bool IsLocal = LIS->intervalIsInOneMBB(VirtReg); int64_t LocalIntfs = 0; - float NrUrgent = 0.0f; + float NumUrgent = 0.0f; // The cascade 
tracking is the same as in the default advisor unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); @@ -649,7 +649,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( if (Cascade <= IntfCascade) { if (!Urgent) return false; - ++NrUrgent; + ++NumUrgent; } LocalIntfs += (IsLocal && LIS->intervalIsInOneMBB(*Intf) && @@ -659,7 +659,7 @@ bool MLEvictAdvisor::loadInterferenceFeatures( // OK, so if we made it this far, this LR is an eviction candidate, load its // features. extractFeatures(InterferingIntervals, Largest, Pos, IsHint, LocalIntfs, - NrUrgent, LRPosInfo); + NumUrgent, LRPosInfo); return true; } @@ -731,7 +731,7 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( extractFeatures(SmallVector(1, &VirtReg), Largest, CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0, - /*NrUrgent*/ 0.0, LRPosInfo); + /*NumUrgent*/ 0.0, LRPosInfo); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); #ifdef LLVM_HAVE_TFLITE @@ -809,7 +809,7 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { I != E;) { MachineInstr *MI = &*(I++); - ++Ret.NrDefsAndUses; + ++Ret.NumDefsAndUses; if (!Visited.insert(MI).second) continue; @@ -846,10 +846,10 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { void MLEvictAdvisor::extractFeatures( const SmallVectorImpl &Intervals, llvm::SmallVectorImpl &Largest, size_t Pos, int64_t IsHint, - int64_t LocalIntfsCount, float NrUrgent, + int64_t LocalIntfsCount, float NumUrgent, SmallVectorImpl &LRPosInfo) const { - int64_t NrDefsAndUses = 0; - int64_t NrBrokenHints = 0; + int64_t NumDefsAndUses = 0; + int64_t NumBrokenHints = 0; double R = 0.0; double W = 0.0; double RW = 0.0; @@ -858,7 +858,7 @@ void MLEvictAdvisor::extractFeatures( float StartBBFreq = 0.0; float EndBBFreq = 0.0; float HottestBlockFreq = 0.0; - int32_t NrRematerializable = 0; + int32_t NumRematerializable = 0; float TotalWeight = 0.0; SlotIndex EndSI = LIS->getSlotIndexes()->getZeroIndex(); @@ -882,9 +882,9 @@ void MLEvictAdvisor::extractFeatures( if (LI.endIndex() > EndSI) EndSI = LI.endIndex(); const LIFeatureComponents &LIFC = getLIFeatureComponents(LI); - NrBrokenHints += VRM->hasPreferredPhys(LI.reg()); + NumBrokenHints += VRM->hasPreferredPhys(LI.reg()); - NrDefsAndUses += LIFC.NrDefsAndUses; + NumDefsAndUses += LIFC.NumDefsAndUses; HottestBlockFreq = std::max(HottestBlockFreq, LIFC.HottestBlockFreq); R += LIFC.R; W += LIFC.W; @@ -893,7 +893,7 @@ void MLEvictAdvisor::extractFeatures( IndVarUpdates += LIFC.IndVarUpdates; HintWeights += LIFC.HintWeights; - NrRematerializable += LIFC.IsRemat; + NumRematerializable += LIFC.IsRemat; if (EnableDevelopmentFeatures) { for (auto CurrentSegment : LI) { @@ -922,12 +922,12 @@ void MLEvictAdvisor::extractFeatures( } while (false) SET(mask, int64_t, 1); SET(is_free, int64_t, Intervals.empty()); - SET(nr_urgent, float, NrUrgent); - SET(nr_broken_hints, float, NrBrokenHints); + SET(nr_urgent, float, NumUrgent); + SET(nr_broken_hints, float, NumBrokenHints); SET(is_hint, int64_t, IsHint); SET(is_local, int64_t, LocalIntfsCount); - SET(nr_rematerializable, float, NrRematerializable); - SET(nr_defs_and_uses, float, NrDefsAndUses); + SET(nr_rematerializable, float, NumRematerializable); + SET(nr_defs_and_uses, float, NumDefsAndUses); SET(weighed_reads_by_max, float, R); SET(weighed_writes_by_max, float, W); SET(weighed_read_writes_by_max, float, RW); diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp 
b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index 43bebc99316e0..b620306628729 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -71,33 +71,33 @@ class CtxInstrumentationLowerer final { // of its parameters, and llvm.instrprof.callsite captures the total number of // callsites. Those values are the same for instances of those intrinsics in // this function. Find the first instance of each and return them. -std::pair getNrCountersAndCallsites(const Function &F) { - uint32_t NrCounters = 0; - uint32_t NrCallsites = 0; +std::pair getNumCountersAndCallsites(const Function &F) { + uint32_t NumCounters = 0; + uint32_t NumCallsites = 0; for (const auto &BB : F) { for (const auto &I : BB) { if (const auto *Incr = dyn_cast(&I)) { uint32_t V = static_cast(Incr->getNumCounters()->getZExtValue()); - assert((!NrCounters || V == NrCounters) && + assert((!NumCounters || V == NumCounters) && "expected all llvm.instrprof.increment[.step] intrinsics to " "have the same total nr of counters parameter"); - NrCounters = V; + NumCounters = V; } else if (const auto *CSIntr = dyn_cast(&I)) { uint32_t V = static_cast(CSIntr->getNumCounters()->getZExtValue()); - assert((!NrCallsites || V == NrCallsites) && + assert((!NumCallsites || V == NumCallsites) && "expected all llvm.instrprof.callsite intrinsics to have the " "same total nr of callsites parameter"); - NrCallsites = V; + NumCallsites = V; } #if NDEBUG - if (NrCounters && NrCallsites) - return std::make_pair(NrCounters, NrCallsites); + if (NumCounters && NumCallsites) + return std::make_pair(NumCounters, NumCallsites); #endif } } - return {NrCounters, NrCallsites}; + return {NumCounters, NumCallsites}; } } // namespace @@ -124,8 +124,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, ContextNodeTy = StructType::get(M.getContext(), { I64Ty, /*Guid*/ PointerTy, /*Next*/ - I32Ty, /*NrCounters*/ - I32Ty, /*NrCallsites*/ + I32Ty, /*NumCounters*/ + I32Ty, /*NumCallsites*/ }); // Define a global for each entrypoint. We'll reuse the entrypoint's name as @@ -157,7 +157,7 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, FunctionType::get(ContextNodeTy->getPointerTo(), {ContextRootTy->getPointerTo(), /*ContextRoot*/ I64Ty, /*Guid*/ I32Ty, - /*NrCounters*/ I32Ty /*NrCallsites*/}, + /*NumCounters*/ I32Ty /*NumCallsites*/}, false)) .getCallee()); GetCtx = cast( @@ -165,8 +165,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, FunctionType::get(ContextNodeTy->getPointerTo(), {PointerTy, /*Callee*/ I64Ty, /*Guid*/ - I32Ty, /*NrCounters*/ - I32Ty}, /*NrCallsites*/ + I32Ty, /*NumCounters*/ + I32Ty}, /*NumCallsites*/ false)) .getCallee()); ReleaseCtx = cast( @@ -208,7 +208,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto &ORE = FAM.getResult(F); Value *Guid = nullptr; - auto [NrCounters, NrCallsites] = getNrCountersAndCallsites(F); + auto [NumCounters, NumCallsites] = getNumCountersAndCallsites(F); Value *Context = nullptr; Value *RealContext = nullptr; @@ -229,12 +229,12 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { Guid = Builder.getInt64( AssignGUIDPass::getGUID(cast(*Mark->getNameValue()))); // The type of the context of this function is now knowable since we have - // NrCallsites and NrCounters. We delcare it here because it's more + // NumCallsites and NumCounters. We delcare it here because it's more // convenient - we have the Builder. 
ThisContextType = StructType::get( F.getContext(), - {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NrCounters), - ArrayType::get(Builder.getPtrTy(), NrCallsites)}); + {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NumCounters), + ArrayType::get(Builder.getPtrTy(), NumCallsites)}); // Figure out which way we obtain the context object for this function - // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the // former case, we also set TheRootContext since we need to release it @@ -243,22 +243,22 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto Iter = ContextRootMap.find(&F); if (Iter != ContextRootMap.end()) { TheRootContext = Iter->second; - Context = Builder.CreateCall(StartCtx, {TheRootContext, Guid, - Builder.getInt32(NrCounters), - Builder.getInt32(NrCallsites)}); + Context = Builder.CreateCall( + StartCtx, {TheRootContext, Guid, Builder.getInt32(NumCounters), + Builder.getInt32(NumCallsites)}); ORE.emit( [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); }); } else { Context = - Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NrCounters), - Builder.getInt32(NrCallsites)}); + Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NumCounters), + Builder.getInt32(NumCallsites)}); ORE.emit([&] { return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F); }); } // The context could be scratch. auto *CtxAsInt = Builder.CreatePtrToInt(Context, Builder.getInt64Ty()); - if (NrCallsites > 0) { + if (NumCallsites > 0) { // Figure out which index of the TLS 2-element buffers to use. // Scratch context => we use index == 1. Real contexts => index == 0. auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1)); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 8dd0cfdb2ae0a..9dd4a561edfdd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -953,15 +953,15 @@ void FunctionInstrumenter::instrument() { } }; // First, count callsites. - uint32_t TotalNrCallsites = 0; - Visit([&TotalNrCallsites](auto *) { ++TotalNrCallsites; }); + uint32_t TotalNumCallsites = 0; + Visit([&TotalNumCallsites](auto *) { ++TotalNumCallsites; }); // Now instrument. 
uint32_t CallsiteIndex = 0; Visit([&](auto *CB) { IRBuilder<> Builder(CB); Builder.CreateCall(CSIntrinsic, - {Name, CFGHash, Builder.getInt32(TotalNrCallsites), + {Name, CFGHash, Builder.getInt32(TotalNumCallsites), Builder.getInt32(CallsiteIndex++), CB->getCalledOperand()}); }); diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp index 7be01445558ec..f48f4f1ac9cc1 100644 --- a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp +++ b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp @@ -24,12 +24,12 @@ class PGOCtxProfRWTest : public ::testing::Test { std::map Roots; public: - ContextNode *createNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, - ContextNode *Next = nullptr) { - auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites); + ContextNode *createNode(GUID Guid, uint32_t NumCounters, + uint32_t NumCallsites, ContextNode *Next = nullptr) { + auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites); auto *Mem = Nodes.emplace_back(std::make_unique(AllocSize)).get(); std::memset(Mem, 0, AllocSize); - auto *Ret = new (Mem) ContextNode(Guid, NrCounters, NrCallsites, Next); + auto *Ret = new (Mem) ContextNode(Guid, NumCounters, NumCallsites, Next); return Ret; } From 9de972e3e1ecea506a3884bd2fc47570ce83e4df Mon Sep 17 00:00:00 2001 From: Ming-Yi Lai Date: Fri, 6 Sep 2024 03:35:12 +0800 Subject: [PATCH 281/425] [clang] Fix FnInfoOpts::operator&= and FnInfoOpts::operator|= not updating assigned operands (#107050) This affects CodeGenTypes::arrangeCall. No test because the only current in-tree use of that function isn't affected. --- clang/lib/CodeGen/CGCall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index 6fa65e1916183..92e0cc43919ca 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -450,12 +450,12 @@ inline FnInfoOpts operator&(FnInfoOpts A, FnInfoOpts B) { llvm::to_underlying(B)); } -inline FnInfoOpts operator|=(FnInfoOpts A, FnInfoOpts B) { +inline FnInfoOpts &operator|=(FnInfoOpts &A, FnInfoOpts B) { A = A | B; return A; } -inline FnInfoOpts operator&=(FnInfoOpts A, FnInfoOpts B) { +inline FnInfoOpts &operator&=(FnInfoOpts &A, FnInfoOpts B) { A = A & B; return A; } From 5515b086f35c2c1402234b9b94338f10fc9d16f7 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:35:26 -0700 Subject: [PATCH 282/425] Factor Process::ExecutionResultAsCString() into a global function (NFC) --- lldb/include/lldb/Target/Process.h | 2 - lldb/include/lldb/Utility/Status.h | 2 + lldb/source/Expression/FunctionCaller.cpp | 3 +- lldb/source/Expression/LLVMUserExpression.cpp | 6 +-- lldb/source/Target/Process.cpp | 37 ------------------- lldb/source/Utility/Status.cpp | 26 +++++++++++++ 6 files changed, 32 insertions(+), 44 deletions(-) diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index a7de991104434..c66cfb2c245ef 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -1296,8 +1296,6 @@ class Process : public std::enable_shared_from_this, const EvaluateExpressionOptions &options, DiagnosticManager &diagnostic_manager); - static const char *ExecutionResultAsCString(lldb::ExpressionResults result); - void GetStatus(Stream &ostrm); size_t GetThreadStatus(Stream &ostrm, bool only_threads_with_stop_reason, diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h 
index 3813a3c160470..308532e7c633d 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -26,6 +26,8 @@ class raw_ostream; namespace lldb_private { +const char *ExpressionResultAsCString(lldb::ExpressionResults result); + /// \class Status Status.h "lldb/Utility/Status.h" An error handling class. /// /// This class is designed to be able to hold any error code that can be diff --git a/lldb/source/Expression/FunctionCaller.cpp b/lldb/source/Expression/FunctionCaller.cpp index 5ac2b0681ebbe..5ce0175fedf45 100644 --- a/lldb/source/Expression/FunctionCaller.cpp +++ b/lldb/source/Expression/FunctionCaller.cpp @@ -390,8 +390,7 @@ lldb::ExpressionResults FunctionCaller::ExecuteFunction( LLDB_LOGF(log, "== [FunctionCaller::ExecuteFunction] Execution of \"%s\" " "completed abnormally: %s ==", - m_name.c_str(), - Process::ExecutionResultAsCString(return_value)); + m_name.c_str(), ExpressionResultAsCString(return_value)); } else { LLDB_LOGF(log, "== [FunctionCaller::ExecuteFunction] Execution of \"%s\" " diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp index b4fdfc4d1fa8b..9e07d4a7374a7 100644 --- a/lldb/source/Expression/LLVMUserExpression.cpp +++ b/lldb/source/Expression/LLVMUserExpression.cpp @@ -235,9 +235,9 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager, expr_thread_id); return execution_result; } else if (execution_result != lldb::eExpressionCompleted) { - diagnostic_manager.Printf( - lldb::eSeverityError, "Couldn't execute function; result was %s", - Process::ExecutionResultAsCString(execution_result)); + diagnostic_manager.Printf(lldb::eSeverityError, + "Couldn't execute function; result was %s", + ExpressionResultAsCString(execution_result)); return execution_result; } } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 3d7ddbe294a49..f2a631a466b35 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5743,43 +5743,6 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx, return return_value; } -const char *Process::ExecutionResultAsCString(ExpressionResults result) { - const char *result_name = ""; - - switch (result) { - case eExpressionCompleted: - result_name = "eExpressionCompleted"; - break; - case eExpressionDiscarded: - result_name = "eExpressionDiscarded"; - break; - case eExpressionInterrupted: - result_name = "eExpressionInterrupted"; - break; - case eExpressionHitBreakpoint: - result_name = "eExpressionHitBreakpoint"; - break; - case eExpressionSetupError: - result_name = "eExpressionSetupError"; - break; - case eExpressionParseError: - result_name = "eExpressionParseError"; - break; - case eExpressionResultUnavailable: - result_name = "eExpressionResultUnavailable"; - break; - case eExpressionTimedOut: - result_name = "eExpressionTimedOut"; - break; - case eExpressionStoppedForDebug: - result_name = "eExpressionStoppedForDebug"; - break; - case eExpressionThreadVanished: - result_name = "eExpressionThreadVanished"; - } - return result_name; -} - void Process::GetStatus(Stream &strm) { const StateType state = GetState(); if (StateIsStoppedState(state, false)) { diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 131fc662bfc0a..40e1fbf3fab1b 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -204,3 +204,29 @@ void llvm::format_provider::format( llvm::format_provider::format(error.AsCString(), OS, Options); } + +const char 
*lldb_private::ExpressionResultAsCString(ExpressionResults result) {
+  switch (result) {
+  case eExpressionCompleted:
+    return "eExpressionCompleted";
+  case eExpressionDiscarded:
+    return "eExpressionDiscarded";
+  case eExpressionInterrupted:
+    return "eExpressionInterrupted";
+  case eExpressionHitBreakpoint:
+    return "eExpressionHitBreakpoint";
+  case eExpressionSetupError:
+    return "eExpressionSetupError";
+  case eExpressionParseError:
+    return "eExpressionParseError";
+  case eExpressionResultUnavailable:
+    return "eExpressionResultUnavailable";
+  case eExpressionTimedOut:
+    return "eExpressionTimedOut";
+  case eExpressionStoppedForDebug:
+    return "eExpressionStoppedForDebug";
+  case eExpressionThreadVanished:
+    return "eExpressionThreadVanished";
+  }
+  return "";
+}

From 53d5ffea6be7216589599b6415c021f8bd13cd37 Mon Sep 17 00:00:00 2001
From: Fabian Parzefall
Date: Thu, 5 Sep 2024 12:39:39 -0700
Subject: [PATCH 283/425] [clang] Check inline defs when emitting speculative vtable (#100785)

Clang should only emit an available_externally vtable when there are no
unused virtual inline functions. Currently, if such a function is
declared without inline inside the class, but is defined inline outside
the class, Clang might emit the vtable as available_externally. This
happens because Clang only considers the declarations of vtable entries,
but not the definitions. This patch addresses this by inspecting the
definitions in addition to the declarations.

---
 clang/lib/CodeGen/ItaniumCXXABI.cpp | 19 +++++++++++---
 .../vtable-available-externally.cpp | 25 +++++++++++++------
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 0cde8a192eda0..fb1eb72d9f340 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -442,7 +442,10 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
       continue;

     const CXXMethodDecl *Method = VtableComponent.getFunctionDecl();
-    if (!Method->getCanonicalDecl()->isInlined())
+    const FunctionDecl *FD = Method->getDefinition();
+    const bool IsInlined =
+        Method->getCanonicalDecl()->isInlined() || (FD && FD->isInlined());
+    if (!IsInlined)
       continue;

     StringRef Name = CGM.getMangledName(VtableComponent.getGlobalDecl());
@@ -2279,8 +2282,18 @@ bool ItaniumCXXABI::canSpeculativelyEmitVTableAsBaseClass(
   if (CGM.getCodeGenOpts().ForceEmitVTables)
     return true;

-  // If we don't have any not emitted inline virtual function then we are safe
-  // to emit an available_externally copy of vtable.
+  // A speculative vtable can only be generated if all virtual inline functions
+  // defined by this class are emitted. The vtable in the final program contains
+  // for each virtual inline function not used in the current TU a function that
+  // is equivalent to the unused function. The function in the actual vtable
+  // does not have to be declared under the same symbol (e.g., a virtual
+  // destructor that can be substituted with its base class's destructor). Since
+  // inline functions are emitted lazily and this emission does not account for
+  // speculative emission of a vtable, we might generate a speculative vtable
+  // with references to inline functions that are not emitted under that name.
+  // This can lead to problems when devirtualizing a call to such a function,
+  // which can result in linking errors. Hence, if there are any unused virtual
+  // inline functions, we cannot emit the speculative vtable.
// FIXME we can still emit a copy of the vtable if we // can emit definition of the inline functions. if (hasAnyUnusedVirtualInlineFunction(RD)) diff --git a/clang/test/CodeGenCXX/vtable-available-externally.cpp b/clang/test/CodeGenCXX/vtable-available-externally.cpp index a57eb39edfe10..ab105260bc75a 100644 --- a/clang/test/CodeGenCXX/vtable-available-externally.cpp +++ b/clang/test/CodeGenCXX/vtable-available-externally.cpp @@ -250,28 +250,39 @@ struct C : A { virtual void car(); }; +// Inline definition outside body, so we can't emit vtable available_externally +// (see previous). +// CHECK-TEST10-DAG: @_ZTVN6Test101FE = external unnamed_addr constant +struct F : A { + void foo(); + virtual void cat(); // inline outside body +}; +inline void F::cat() {} + // no key function, vtable will be generated everywhere it will be used // CHECK-TEST10-DAG: @_ZTVN6Test101EE = linkonce_odr unnamed_addr constant // CHECK-FORCE-EMIT-DAG: @_ZTVN6Test101EE = linkonce_odr unnamed_addr constant struct E : A {}; -void g(A& a) { +void h(A& a) { a.foo(); a.bar(); } -void f() { +void g() { A a; - g(a); + h(a); B b; - g(b); + h(b); C c; - g(c); + h(c); D d; - g(d); + h(d); E e; - g(e); + h(e); + F f; + h(f); } } // Test10 From b798f4bd50bbf0f5eb46804afad10629797c73aa Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:44:13 -0700 Subject: [PATCH 284/425] [lldb] Make deep copies of Status explicit (NFC) (#107170) --- lldb/bindings/python/python-swigsafecast.swig | 4 +-- lldb/include/lldb/API/SBError.h | 4 +-- lldb/include/lldb/API/SBValueList.h | 2 +- .../lldb/Core/ValueObjectConstResult.h | 4 +-- lldb/include/lldb/Utility/Status.h | 27 +++++++++++++++++-- lldb/source/API/SBBreakpoint.cpp | 6 ++--- lldb/source/API/SBBreakpointLocation.cpp | 4 +-- lldb/source/API/SBBreakpointName.cpp | 17 ++++++------ lldb/source/API/SBDebugger.cpp | 2 +- lldb/source/API/SBError.cpp | 15 ++++++----- lldb/source/API/SBFile.cpp | 15 ++++------- lldb/source/API/SBFormat.cpp | 2 +- lldb/source/API/SBFrame.cpp | 9 ++++--- lldb/source/API/SBPlatform.cpp | 4 +-- lldb/source/API/SBProcess.cpp | 2 +- lldb/source/API/SBSaveCoreOptions.cpp | 3 +-- lldb/source/API/SBStructuredData.cpp | 2 +- lldb/source/API/SBTarget.cpp | 5 ++-- lldb/source/API/SBThread.cpp | 2 +- lldb/source/API/SBValue.cpp | 4 +-- lldb/source/API/SBValueList.cpp | 13 ++++----- lldb/source/API/SBWatchpoint.cpp | 2 +- .../source/Commands/CommandObjectCommands.cpp | 4 +-- lldb/source/Core/Debugger.cpp | 2 +- lldb/source/Core/ModuleList.cpp | 5 ++-- lldb/source/Core/ValueObject.cpp | 4 +-- lldb/source/Core/ValueObjectCast.cpp | 2 +- lldb/source/Core/ValueObjectConstResult.cpp | 9 ++++--- lldb/source/Core/ValueObjectDynamicValue.cpp | 2 +- .../Core/ValueObjectSyntheticFilter.cpp | 2 +- lldb/source/Expression/Materializer.cpp | 2 +- lldb/source/Expression/UserExpression.cpp | 8 +++--- lldb/source/Host/common/LockFileBase.cpp | 4 +-- .../Host/common/NativeProcessProtocol.cpp | 8 +++--- .../posix/ConnectionFileDescriptorPosix.cpp | 12 ++++----- .../source/Interpreter/CommandInterpreter.cpp | 2 +- lldb/source/Interpreter/ScriptInterpreter.cpp | 2 +- .../Plugins/Platform/Android/AdbClient.cpp | 8 +++--- .../PlatformAndroidRemoteGDBServer.cpp | 4 +-- ...PlatformiOSSimulatorCoreSimulatorSupport.h | 2 +- ...latformiOSSimulatorCoreSimulatorSupport.mm | 12 ++++----- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 4 +-- .../Platform/Windows/PlatformWindows.cpp | 4 +-- .../ScriptedProcessPythonInterface.cpp | 6 ++--- .../Interfaces/ScriptedPythonInterface.h | 8 ++++-- 
.../Python/SWIGPythonBridge.h | 2 +- .../Plugins/SymbolFile/DWARF/DWARFUnit.h | 2 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 2 +- .../DWARF/SymbolFileDWARFDebugMap.cpp | 2 +- lldb/source/Target/ModuleCache.cpp | 2 +- lldb/source/Target/Platform.cpp | 4 +-- lldb/source/Target/Process.cpp | 4 +-- lldb/source/Target/StackFrame.cpp | 2 +- lldb/source/Target/Target.cpp | 12 ++++----- lldb/source/Utility/Status.cpp | 7 +++++ .../Python/PythonTestSuite.cpp | 2 +- .../Target/RemoteAwarePlatformTest.cpp | 8 +++--- 57 files changed, 172 insertions(+), 141 deletions(-) diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index 34f8c6f0ff8d3..0127ac6bfa468 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -33,8 +33,8 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp) { SWIGTYPE_p_lldb__SBBreakpoint); } -PythonObject SWIGBridge::ToSWIGWrapper(const Status& status) { - return ToSWIGHelper(new lldb::SBError(status), SWIGTYPE_p_lldb__SBError); +PythonObject SWIGBridge::ToSWIGWrapper(Status status) { + return ToSWIGHelper(new lldb::SBError(std::move(status)), SWIGTYPE_p_lldb__SBError); } PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr data_sb) { diff --git a/lldb/include/lldb/API/SBError.h b/lldb/include/lldb/API/SBError.h index 17f2c6c3027af..9f55f92131c06 100644 --- a/lldb/include/lldb/API/SBError.h +++ b/lldb/include/lldb/API/SBError.h @@ -97,7 +97,7 @@ class LLDB_API SBError { friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; - SBError(const lldb_private::Status &error); + SBError(lldb_private::Status &&error); lldb_private::Status *get(); @@ -107,7 +107,7 @@ class LLDB_API SBError { lldb_private::Status &ref(); - void SetError(const lldb_private::Status &lldb_error); + void SetError(lldb_private::Status &&lldb_error); private: std::unique_ptr m_opaque_up; diff --git a/lldb/include/lldb/API/SBValueList.h b/lldb/include/lldb/API/SBValueList.h index a5017bccc5053..52a86f989e153 100644 --- a/lldb/include/lldb/API/SBValueList.h +++ b/lldb/include/lldb/API/SBValueList.h @@ -96,7 +96,7 @@ class LLDB_API SBValueList { std::unique_ptr m_opaque_up; - void SetError(const lldb_private::Status &status); + void SetError(lldb_private::Status &&status); }; } // namespace lldb diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index d3b3362bd0e9e..9c34617af71d0 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -61,7 +61,7 @@ class ValueObjectConstResult : public ValueObject { // When an expression fails to evaluate, we return an error static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, - const Status &error); + Status &&error); std::optional GetByteSize() override; @@ -146,7 +146,7 @@ class ValueObjectConstResult : public ValueObject { ConstString name, Module *module = nullptr); ValueObjectConstResult(ExecutionContextScope *exe_scope, - ValueObjectManager &manager, const Status &error); + ValueObjectManager &manager, Status &&error); ValueObject *CreateChildAtIndex(size_t idx) override { return m_impl.CreateChildAtIndex(idx); diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index 308532e7c633d..795c830b96517 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -43,13 +43,32 @@ const char 
*ExpressionResultAsCString(lldb::ExpressionResults result); /// of themselves for printing results and error codes. The string value will /// be fetched on demand and its string value will be cached until the error /// is cleared of the value of the error changes. +/// +/// API design notes: +/// +/// Most APIs that currently vend a Status would be better served by +/// returning llvm::Expected<> instead. If possibles APIs should be +/// refactored to avoid Status. The only legitimate long-term uses of +/// Status are objects that need to store an error for a long time +/// (which should be questioned as a design decision, too). +/// +/// Implementation notes: +/// +/// Internally, Status stores an llvm::Error. +/// eErrorTypeInvalid +/// eErrorTypeGeneric llvm::StringError +/// eErrorTypePOSIX llvm::ECError +/// eErrorTypeMachKernel MachKernelError +/// eErrorTypeExpression llvm::ErrorList +/// eErrorTypeWin32 Win32Error + class Status { public: - /// Every error value that this object can contain needs to be able to fit /// into ValueType. typedef uint32_t ValueType; Status(); + Status(Status &&other) = default; /// Initialize the error object with a generic success value. /// @@ -93,10 +112,14 @@ class Status { ~Status(); + const Status &operator=(Status &&); /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. static Status FromError(llvm::Error error); - /// FIXME: Replace this with a takeError method. + /// FIXME: Replace this with a takeError() method. llvm::Error ToError() const; + /// Don't call this function in new code. Instead, redesign the API + /// to use llvm::Expected instead of Status. + Status Clone() const { return Status(ToError()); } /// Get the error string associated with the current error. // diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index 3e9c01080588e..b2ed034d19983 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -622,7 +622,7 @@ SBError SBBreakpoint::SetScriptCallbackFunction( callback_function_name, extra_args.m_impl_up ->GetObjectSP()); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -645,7 +645,7 @@ SBError SBBreakpoint::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() ->SetBreakpointCommandCallback(bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -670,7 +670,7 @@ SBError SBBreakpoint::AddNameWithErrorHandling(const char *new_name) { bkpt_sp->GetTarget().GetAPIMutex()); Status error; bkpt_sp->GetTarget().AddNameToBreakpoint(bkpt_sp, new_name, error); - status.SetError(error); + status.SetError(std::move(error)); } else { status = Status::FromErrorString("invalid breakpoint"); } diff --git a/lldb/source/API/SBBreakpointLocation.cpp b/lldb/source/API/SBBreakpointLocation.cpp index e5c96b81e8090..b2d1da3927c6e 100644 --- a/lldb/source/API/SBBreakpointLocation.cpp +++ b/lldb/source/API/SBBreakpointLocation.cpp @@ -239,7 +239,7 @@ SBError SBBreakpointLocation::SetScriptCallbackFunction( callback_function_name, extra_args.m_impl_up ->GetObjectSP()); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); @@ -264,7 +264,7 @@ SBBreakpointLocation::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() 
->SetBreakpointCommandCallback(bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else sb_error = Status::FromErrorString("invalid breakpoint"); diff --git a/lldb/source/API/SBBreakpointName.cpp b/lldb/source/API/SBBreakpointName.cpp index 7dc8dee19f43d..831260d44e8e7 100644 --- a/lldb/source/API/SBBreakpointName.cpp +++ b/lldb/source/API/SBBreakpointName.cpp @@ -570,14 +570,13 @@ SBError SBBreakpointName::SetScriptCallbackFunction( m_impl_up->GetTarget()->GetAPIMutex()); BreakpointOptions &bp_options = bp_name->GetOptions(); - Status error; - error = m_impl_up->GetTarget() - ->GetDebugger() - .GetScriptInterpreter() - ->SetBreakpointCommandCallbackFunction( - bp_options, callback_function_name, - extra_args.m_impl_up->GetObjectSP()); - sb_error.SetError(error); + Status error = m_impl_up->GetTarget() + ->GetDebugger() + .GetScriptInterpreter() + ->SetBreakpointCommandCallbackFunction( + bp_options, callback_function_name, + extra_args.m_impl_up->GetObjectSP()); + sb_error.SetError(std::move(error)); UpdateName(*bp_name); return sb_error; } @@ -600,7 +599,7 @@ SBBreakpointName::SetScriptCallbackBody(const char *callback_body_text) { .GetScriptInterpreter() ->SetBreakpointCommandCallback( bp_options, callback_body_text, /*is_callback=*/false); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); if (!sb_error.Fail()) UpdateName(*bp_name); diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index c226acc15018e..b21d7e6729007 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -1360,7 +1360,7 @@ SBError SBDebugger::SetInternalVariable(const char *var_name, const char *value, "invalid debugger instance name '%s'", debugger_instance_name); } if (error.Fail()) - sb_error.SetError(error); + sb_error.SetError(std::move(error)); return sb_error; } diff --git a/lldb/source/API/SBError.cpp b/lldb/source/API/SBError.cpp index 30d9ccc78ee37..31964931649db 100644 --- a/lldb/source/API/SBError.cpp +++ b/lldb/source/API/SBError.cpp @@ -23,7 +23,8 @@ SBError::SBError() { LLDB_INSTRUMENT_VA(this); } SBError::SBError(const SBError &rhs) { LLDB_INSTRUMENT_VA(this, rhs); - m_opaque_up = clone(rhs.m_opaque_up); + if (rhs.m_opaque_up) + m_opaque_up = std::make_unique(rhs.m_opaque_up->Clone()); } SBError::SBError(const char *message) { @@ -32,8 +33,8 @@ SBError::SBError(const char *message) { SetErrorString(message); } -SBError::SBError(const lldb_private::Status &status) - : m_opaque_up(new Status(status)) { +SBError::SBError(lldb_private::Status &&status) + : m_opaque_up(new Status(std::move(status))) { LLDB_INSTRUMENT_VA(this, status); } @@ -43,7 +44,9 @@ const SBError &SBError::operator=(const SBError &rhs) { LLDB_INSTRUMENT_VA(this, rhs); if (this != &rhs) - m_opaque_up = clone(rhs.m_opaque_up); + if (rhs.m_opaque_up) + m_opaque_up = std::make_unique(rhs.m_opaque_up->Clone()); + return *this; } @@ -111,9 +114,9 @@ void SBError::SetError(uint32_t err, ErrorType type) { *m_opaque_up = Status(err, type); } -void SBError::SetError(const Status &lldb_error) { +void SBError::SetError(Status &&lldb_error) { CreateIfNeeded(); - *m_opaque_up = lldb_error; + *m_opaque_up = std::move(lldb_error); } void SBError::SetErrorToErrno() { diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp index 623708780f4c6..2ae4b1481afbf 100644 --- a/lldb/source/API/SBFile.cpp +++ b/lldb/source/API/SBFile.cpp @@ -62,8 +62,7 @@ SBError SBFile::Read(uint8_t *buf, size_t num_bytes, 
size_t *bytes_read) { error = Status::FromErrorString("invalid SBFile"); *bytes_read = 0; } else { - Status status = m_opaque_sp->Read(buf, num_bytes); - error.SetError(status); + error.SetError(m_opaque_sp->Read(buf, num_bytes)); *bytes_read = num_bytes; } return error; @@ -78,8 +77,7 @@ SBError SBFile::Write(const uint8_t *buf, size_t num_bytes, error = Status::FromErrorString("invalid SBFile"); *bytes_written = 0; } else { - Status status = m_opaque_sp->Write(buf, num_bytes); - error.SetError(status); + error.SetError(m_opaque_sp->Write(buf, num_bytes)); *bytes_written = num_bytes; } return error; @@ -92,8 +90,7 @@ SBError SBFile::Flush() { if (!m_opaque_sp) { error = Status::FromErrorString("invalid SBFile"); } else { - Status status = m_opaque_sp->Flush(); - error.SetError(status); + error.SetError(m_opaque_sp->Flush()); } return error; } @@ -106,10 +103,8 @@ bool SBFile::IsValid() const { SBError SBFile::Close() { LLDB_INSTRUMENT_VA(this); SBError error; - if (m_opaque_sp) { - Status status = m_opaque_sp->Close(); - error.SetError(status); - } + if (m_opaque_sp) + error.SetError(m_opaque_sp->Close()); return error; } diff --git a/lldb/source/API/SBFormat.cpp b/lldb/source/API/SBFormat.cpp index 51cddceea0372..080e219d64a36 100644 --- a/lldb/source/API/SBFormat.cpp +++ b/lldb/source/API/SBFormat.cpp @@ -36,7 +36,7 @@ SBFormat::SBFormat(const char *format, lldb::SBError &error) { FormatEntrySP format_entry_sp = std::make_shared(); Status status = FormatEntity::Parse(format, *format_entry_sp); - error.SetError(status); + error.SetError(std::move(status)); if (error.Success()) m_opaque_sp = format_entry_sp; } diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index b1360e88bcd3a..30c44b974988d 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -809,7 +809,7 @@ SBValueList SBFrame::GetVariables(const lldb::SBVariablesOptions &options) { Status var_error; variable_list = frame->GetVariableList(true, &var_error); if (var_error.Fail()) - value_list.SetError(var_error); + value_list.SetError(std::move(var_error)); if (variable_list) { const size_t num_variables = variable_list->GetSize(); if (num_variables) { @@ -1033,7 +1033,8 @@ SBValue SBFrame::EvaluateExpression(const char *expr) { Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - ValueObjectSP error_val_sp = ValueObjectConstResult::Create(nullptr, error); + ValueObjectSP error_val_sp = + ValueObjectConstResult::Create(nullptr, std::move(error)); result.SetSP(error_val_sp, false); } return result; @@ -1129,13 +1130,13 @@ lldb::SBValue SBFrame::EvaluateExpression(const char *expr, Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = ValueObjectConstResult::Create(nullptr, std::move(error)); expr_result.SetSP(expr_value_sp, false); } } else { Status error; error = Status::FromErrorString("sbframe object is not valid."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = ValueObjectConstResult::Create(nullptr, std::move(error)); expr_result.SetSP(expr_value_sp, false); } diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp index 2f0f925302f16..394268b77aa21 100644 --- a/lldb/source/API/SBPlatform.cpp +++ b/lldb/source/API/SBPlatform.cpp @@ -586,7 +586,7 @@ SBProcess SBPlatform::Attach(SBAttachInfo &attach_info, Status status; ProcessSP 
process_sp = platform_sp->Attach(info, debugger.ref(), target.GetSP().get(), status); - error.SetError(status); + error.SetError(std::move(status)); return SBProcess(process_sp); } @@ -728,7 +728,7 @@ SBError SBPlatform::SetLocateModuleCallback( symbol_file_spec = symbol_file_spec_sb.ref(); } - return error.ref(); + return error.ref().Clone(); }); return SBError(); } diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 3eed51f0245f6..9773144723c34 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -1450,7 +1450,7 @@ lldb::SBError SBProcess::DeallocateMemory(lldb::addr_t ptr) { std::lock_guard guard( process_sp->GetTarget().GetAPIMutex()); Status error = process_sp->DeallocateMemory(ptr); - sb_error.SetError(error); + sb_error.SetError(std::move(error)); } else { sb_error = Status::FromErrorString("process is running"); } diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index 2cd431611ef55..ef82b0253f119 100644 --- a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -40,8 +40,7 @@ SBSaveCoreOptions::operator=(const SBSaveCoreOptions &rhs) { SBError SBSaveCoreOptions::SetPluginName(const char *name) { LLDB_INSTRUMENT_VA(this, name); - lldb_private::Status error = m_opaque_up->SetPluginName(name); - return SBError(error); + return SBError(m_opaque_up->SetPluginName(name)); } void SBSaveCoreOptions::SetStyle(lldb::SaveCoreStyle style) { diff --git a/lldb/source/API/SBStructuredData.cpp b/lldb/source/API/SBStructuredData.cpp index 801ebf45e0e52..b891a34bd7c76 100644 --- a/lldb/source/API/SBStructuredData.cpp +++ b/lldb/source/API/SBStructuredData.cpp @@ -133,7 +133,7 @@ lldb::SBError SBStructuredData::GetDescription(lldb::SBStream &stream) const { Status error = m_impl_up->GetDescription(stream.ref()); SBError sb_error; - sb_error.SetError(error); + sb_error.SetError(std::move(error)); return sb_error; } diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 41eb77e5506bc..1c1f7e2a03def 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -1369,7 +1369,7 @@ SBTarget::WatchpointCreateByAddress(lldb::addr_t addr, size_t size, CompilerType *type = nullptr; watchpoint_sp = target_sp->CreateWatchpoint(addr, size, type, watch_type, cw_error); - error.SetError(cw_error); + error.SetError(std::move(cw_error)); sb_watchpoint.SetSP(watchpoint_sp); } @@ -2326,7 +2326,8 @@ lldb::SBValue SBTarget::EvaluateExpression(const char *expr, Status error; error = Status::FromErrorString("can't evaluate expressions when the " "process is running."); - expr_value_sp = ValueObjectConstResult::Create(nullptr, error); + expr_value_sp = + ValueObjectConstResult::Create(nullptr, std::move(error)); } } else { target->EvaluateExpression(expr, frame, expr_value_sp, options.ref()); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 92868130f6222..7508eed5d6fdb 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -958,7 +958,7 @@ SBError SBThread::JumpToLine(lldb::SBFileSpec &file_spec, uint32_t line) { Thread *thread = exe_ctx.GetThreadPtr(); Status err = thread->JumpToLine(file_spec.ref(), line, true); - sb_error.SetError(err); + sb_error.SetError(std::move(err)); return sb_error; } diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index df0e82b6523fb..273aac5ad4798 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -270,7 +270,7 @@ SBError 
SBValue::GetError() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) - sb_error.SetError(value_sp->GetError()); + sb_error.SetError(value_sp->GetError().Clone()); else sb_error = Status::FromErrorStringWithFormat("error: %s", locker.GetError().AsCString()); @@ -1476,7 +1476,7 @@ lldb::SBWatchpoint SBValue::Watch(bool resolve_location, bool read, bool write, CompilerType type(value_sp->GetCompilerType()); WatchpointSP watchpoint_sp = target_sp->CreateWatchpoint(addr, byte_size, &type, watch_type, rc); - error.SetError(rc); + error.SetError(std::move(rc)); if (watchpoint_sp) { sb_watchpoint.SetSP(watchpoint_sp); diff --git a/lldb/source/API/SBValueList.cpp b/lldb/source/API/SBValueList.cpp index ba7e06971dc36..63b915094baf2 100644 --- a/lldb/source/API/SBValueList.cpp +++ b/lldb/source/API/SBValueList.cpp @@ -22,13 +22,14 @@ class ValueListImpl { public: ValueListImpl() = default; - ValueListImpl(const ValueListImpl &rhs) = default; + ValueListImpl(const ValueListImpl &rhs) + : m_values(rhs.m_values), m_error(rhs.m_error.Clone()) {} ValueListImpl &operator=(const ValueListImpl &rhs) { if (this == &rhs) return *this; m_values = rhs.m_values; - m_error = rhs.m_error; + m_error = rhs.m_error.Clone(); return *this; } @@ -67,7 +68,7 @@ class ValueListImpl { const Status &GetError() const { return m_error; } - void SetError(const Status &error) { m_error = error; } + void SetError(Status &&error) { m_error = std::move(error); } private: std::vector m_values; @@ -205,10 +206,10 @@ lldb::SBError SBValueList::GetError() { LLDB_INSTRUMENT_VA(this); SBError sb_error; if (m_opaque_up) - sb_error.SetError(m_opaque_up->GetError()); + sb_error.SetError(m_opaque_up->GetError().Clone()); return sb_error; } -void SBValueList::SetError(const lldb_private::Status &status) { - ref().SetError(status); +void SBValueList::SetError(lldb_private::Status &&status) { + ref().SetError(std::move(status)); } diff --git a/lldb/source/API/SBWatchpoint.cpp b/lldb/source/API/SBWatchpoint.cpp index 9664bbe618600..21e2dcc01968e 100644 --- a/lldb/source/API/SBWatchpoint.cpp +++ b/lldb/source/API/SBWatchpoint.cpp @@ -86,7 +86,7 @@ SBError SBWatchpoint::GetError() { SBError sb_error; lldb::WatchpointSP watchpoint_sp(GetSP()); if (watchpoint_sp) { - sb_error.SetError(watchpoint_sp->GetError()); + sb_error.SetError(watchpoint_sp->GetError().Clone()); } return sb_error; } diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index f8f2b97eb898f..e3291640fa935 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -1874,8 +1874,8 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { ~CommandObjectScriptingObjectParsed() override = default; - Status GetOptionsError() { return m_options_error; } - Status GetArgsError() { return m_args_error; } + Status GetOptionsError() { return m_options_error.Clone(); } + Status GetArgsError() { return m_args_error.Clone(); } bool WantsCompletion() override { return true; } bool IsRemovable() const override { return true; } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 1266355578e32..9bdc5a3949751 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -258,7 +258,7 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx, StreamString feedback_stream; if (!target_sp->LoadScriptingResources(errors, feedback_stream)) { Stream &s = GetErrorStream(); - for (auto 
error : errors) { + for (auto &error : errors) { s.Printf("%s\n", error.AsCString()); } if (feedback_stream.GetSize()) diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index bba4199ce9e83..2b8ccab2406c6 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -1046,8 +1046,8 @@ bool ModuleList::LoadScriptingResourcesInTarget(Target *target, return false; std::lock_guard guard(m_modules_mutex); for (auto module : m_modules) { - Status error; if (module) { + Status error; if (!module->LoadScriptingResourceInTarget(target, error, feedback_stream)) { if (error.Fail() && error.AsCString()) { @@ -1058,8 +1058,7 @@ bool ModuleList::LoadScriptingResourcesInTarget(Target *target, .GetFileNameStrippingExtension() .GetCString(), error.AsCString()); - errors.push_back(error); - + errors.push_back(std::move(error)); if (!continue_on_error) return false; } diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index d56bd004e63c7..1bedd87e943dc 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -2763,7 +2763,7 @@ ValueObjectSP ValueObject::CreateConstantValue(ConstString name) { if (!valobj_sp) { ExecutionContext exe_ctx(GetExecutionContextRef()); valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), m_error); + exe_ctx.GetBestExecutionContextScope(), m_error.Clone()); } return valobj_sp; } @@ -2974,7 +2974,7 @@ ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) { return ValueObjectConstResult::Create( ExecutionContext(GetExecutionContextRef()).GetBestExecutionContextScope(), - error); + std::move(error)); } lldb::ValueObjectSP ValueObject::Clone(ConstString new_name) { diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index c8e3164151417..308fa161180d4 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -86,7 +86,7 @@ bool ValueObjectCast::UpdateValue() { // The dynamic value failed to get an error, pass the error along if (m_error.Success() && m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); SetValueIsValid(false); return false; } diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index 879d3c3f6b037..60850c15e6a83 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -169,16 +169,17 @@ ValueObjectConstResult::ValueObjectConstResult( } ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, - const Status &error) { + Status &&error) { auto manager_sp = ValueObjectManager::Create(); - return (new ValueObjectConstResult(exe_scope, *manager_sp, error))->GetSP(); + return (new ValueObjectConstResult(exe_scope, *manager_sp, std::move(error))) + ->GetSP(); } ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, - const Status &error) + Status &&error) : ValueObject(exe_scope, manager), m_impl(this) { - m_error = error; + m_error = std::move(error); SetIsConstant(); } diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp b/lldb/source/Core/ValueObjectDynamicValue.cpp index d6c523e99f10d..67311ea6e3be4 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -118,7 +118,7 @@ bool ValueObjectDynamicValue::UpdateValue() { if (!m_parent->UpdateValueIfNeeded(false)) { 
// The dynamic value failed to get an error, pass the error along if (m_error.Success() && m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); return false; } diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index adac1b400705e..091b8f883b605 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -169,7 +169,7 @@ bool ValueObjectSynthetic::UpdateValue() { // our parent could not update.. as we are meaningless without a parent, // just stop if (m_parent->GetError().Fail()) - m_error = m_parent->GetError(); + m_error = m_parent->GetError().Clone(); return false; } diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index fa0edad1fa583..8097e38919b04 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -462,7 +462,7 @@ class EntityVariableBase : public Materializer::Entity { return; } - Status valobj_error = valobj_sp->GetError(); + Status valobj_error = valobj_sp->GetError().Clone(); if (valobj_error.Fail()) { err = Status::FromErrorStringWithFormat( diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp index c2889e4c986bf..872f6304f91ba 100644 --- a/lldb/source/Expression/UserExpression.cpp +++ b/lldb/source/Expression/UserExpression.cpp @@ -268,10 +268,10 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, const bool generate_debug_info = options.GetGenerateDebugInfo(); if (options.InvokeCancelCallback(lldb::eExpressionEvaluationParse)) { - error = Status::FromErrorString( + Status error = Status::FromErrorString( "expression interrupted by callback before parse"); result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); return lldb::eExpressionInterrupted; } @@ -364,7 +364,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, lldb::eExpressionInterrupted, "expression interrupted by callback before execution"); result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); return lldb::eExpressionInterrupted; } @@ -415,7 +415,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, if (result_valobj_sp.get() == nullptr) { result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); } return execution_results; diff --git a/lldb/source/Host/common/LockFileBase.cpp b/lldb/source/Host/common/LockFileBase.cpp index 01a8c38cef58e..6ef684e6d622c 100644 --- a/lldb/source/Host/common/LockFileBase.cpp +++ b/lldb/source/Host/common/LockFileBase.cpp @@ -50,7 +50,7 @@ Status LockFileBase::Unlock() { if (!IsLocked()) return NotLocked(); - const auto error = DoUnlock(); + Status error = DoUnlock(); if (error.Success()) { m_locked = false; m_start = 0; @@ -69,7 +69,7 @@ Status LockFileBase::DoLock(const Locker &locker, const uint64_t start, if (IsLocked()) return AlreadyLocked(); - const auto error = locker(start, len); + Status error = locker(start, len); if (error.Success()) { m_locked = true; m_start = start; diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index a84d8db1c8794..e36eefaa6f4a4 100644 --- 
a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -215,17 +215,17 @@ Status NativeProcessProtocol::RemoveWatchpoint(lldb::addr_t addr) { for (const auto &thread : m_threads) { assert(thread && "thread list should not have a NULL thread!"); - const Status thread_error = thread->RemoveWatchpoint(addr); + Status thread_error = thread->RemoveWatchpoint(addr); if (thread_error.Fail()) { // Keep track of the first thread error if any threads fail. We want to // try to remove the watchpoint from every thread, though, even if one or // more have errors. if (!overall_error.Fail()) - overall_error = thread_error; + overall_error = std::move(thread_error); } } - const Status error = m_watchpoint_list.Remove(addr); - return overall_error.Fail() ? overall_error : error; + Status error = m_watchpoint_list.Remove(addr); + return overall_error.Fail() ? std::move(overall_error) : std::move(error); } const HardwareBreakpointMap & diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 2a2fcf00c0adf..d0cc68826d4bb 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -233,7 +233,7 @@ ConnectionStatus ConnectionFileDescriptor::Disconnect(Status *error_ptr) { if (error.Fail()) status = eConnectionStatusError; if (error_ptr) - *error_ptr = error; + *error_ptr = std::move(error); // Close any pipes we were using for async interrupts m_pipe.Close(); @@ -295,7 +295,7 @@ size_t ConnectionFileDescriptor::Read(void *dst, size_t dst_len, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { uint32_t error_value = error.GetError(); @@ -393,7 +393,7 @@ size_t ConnectionFileDescriptor::Write(const void *src, size_t src_len, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { switch (error.GetError()) { @@ -476,7 +476,7 @@ ConnectionFileDescriptor::BytesAvailable(const Timeout &timeout, Status error = select_helper.Select(); if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); if (error.Fail()) { switch (error.GetError()) { @@ -557,7 +557,7 @@ lldb::ConnectionStatus ConnectionFileDescriptor::AcceptSocket( } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); return eConnectionStatusError; } @@ -579,7 +579,7 @@ ConnectionFileDescriptor::ConnectSocket(Socket::SocketProtocol socket_protocol, } if (error_ptr) - *error_ptr = error; + *error_ptr = error.Clone(); return eConnectionStatusError; } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index df539d5f5bcee..b93f47a8a8d5e 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1861,7 +1861,7 @@ CommandInterpreter::PreprocessToken(std::string &expr_str) { // But if for some reason we didn't get a value object at all, then we will // make up some helpful errors from the expression result. 
if (expr_result_valobj_sp) - error = expr_result_valobj_sp->GetError(); + error = expr_result_valobj_sp->GetError().Clone(); if (error.Success()) { std::string result = lldb_private::toString(expr_result) + diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 079ab9044de63..8b55221da6e76 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -96,7 +96,7 @@ lldb::ProcessLaunchInfoSP ScriptInterpreter::GetOpaqueTypeFromSBLaunchInfo( Status ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { if (error.m_opaque_up) - return *error.m_opaque_up; + return error.m_opaque_up->Clone(); return Status(); } diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.cpp b/lldb/source/Plugins/Platform/Android/AdbClient.cpp index 00e66f8818f07..a179260ca15f6 100644 --- a/lldb/source/Plugins/Platform/Android/AdbClient.cpp +++ b/lldb/source/Plugins/Platform/Android/AdbClient.cpp @@ -173,7 +173,7 @@ Status AdbClient::SetPortForwarding(const uint16_t local_port, snprintf(message, sizeof(message), "forward:tcp:%d;tcp:%d", local_port, remote_port); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -192,7 +192,7 @@ AdbClient::SetPortForwarding(const uint16_t local_port, snprintf(message, sizeof(message), "forward:tcp:%d;%s:%s", local_port, sock_namespace_str, remote_socket_name.str().c_str()); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -203,7 +203,7 @@ Status AdbClient::DeletePortForwarding(const uint16_t local_port) { char message[32]; snprintf(message, sizeof(message), "killforward:tcp:%d", local_port); - const auto error = SendDeviceMessage(message); + Status error = SendDeviceMessage(message); if (error.Fail()) return error; @@ -588,7 +588,7 @@ AdbClient::SyncService::executeCommand(const std::function &cmd) { if (!m_conn) return Status::FromErrorString("SyncService is disconnected"); - const auto error = cmd(); + Status error = cmd(); if (error.Fail()) m_conn.reset(); diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp index de7031701df4d..d18b718d4a56c 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp @@ -189,8 +189,8 @@ Status PlatformAndroidRemoteGDBServer::MakeConnectURL( Status error; auto forward = [&](const uint16_t local, const uint16_t remote) { - error = ForwardPortWithAdb(local, remote, remote_socket_name, - m_socket_namespace, m_device_id); + Status error = ForwardPortWithAdb(local, remote, remote_socket_name, + m_socket_namespace, m_device_id); if (error.Success()) { m_port_forwards[pid] = local; std::ostringstream url_str; diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h index b2956054d50bf..57dd32f587a65 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.h @@ -34,7 +34,7 @@ class Process { explicit operator bool() { return m_pid != LLDB_INVALID_PROCESS_ID; } - lldb_private::Status GetError() { return 
m_error; } + lldb_private::Status GetError() { return m_error.Clone(); } private: Process(lldb::pid_t p); diff --git a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm index 303a5409c6fe4..f3e79d3d56154 100644 --- a/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm +++ b/lldb/source/Plugins/Platform/MacOSX/objcxx/PlatformiOSSimulatorCoreSimulatorSupport.mm @@ -61,10 +61,10 @@ - (BOOL)spawnWithPath:(NSString *)path CoreSimulatorSupport::Process::Process(lldb::pid_t p) : m_pid(p), m_error() {} CoreSimulatorSupport::Process::Process(Status error) - : m_pid(LLDB_INVALID_PROCESS_ID), m_error(error) {} + : m_pid(LLDB_INVALID_PROCESS_ID), m_error(std::move(error)) {} CoreSimulatorSupport::Process::Process(lldb::pid_t p, Status error) - : m_pid(p), m_error(error) {} + : m_pid(p), m_error(std::move(error)) {} CoreSimulatorSupport::DeviceType::DeviceType() : m_model_identifier() {} @@ -498,19 +498,19 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, STDIN_FILENO, stdin_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); error = HandleFileAction(launch_info, options, kSimDeviceSpawnStdout, STDOUT_FILENO, stdout_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); error = HandleFileAction(launch_info, options, kSimDeviceSpawnStderr, STDERR_FILENO, stderr_file); if (error.Fail()) - return CoreSimulatorSupport::Process(error); + return CoreSimulatorSupport::Process(std::move(error)); #undef kSimDeviceSpawnEnvironment #undef kSimDeviceSpawnStdin @@ -539,7 +539,7 @@ static Status HandleFileAction(ProcessLaunchInfo &launch_info, : "unable to launch"); } - return CoreSimulatorSupport::Process(pid, error); + return CoreSimulatorSupport::Process(pid, std::move(error)); } CoreSimulatorSupport::DeviceSet diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index 70b49c0424bbb..31315e46ca168 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -544,7 +544,7 @@ Status PlatformPOSIX::EvaluateLibdlExpression( return expr_error; if (result_valobj_sp->GetError().Fail()) - return result_valobj_sp->GetError(); + return result_valobj_sp->GetError().Clone(); return Status(); } @@ -976,7 +976,7 @@ Status PlatformPOSIX::UnloadImage(lldb_private::Process *process, return error; if (result_valobj_sp->GetError().Fail()) - return result_valobj_sp->GetError(); + return result_valobj_sp->GetError().Clone(); Scalar scalar; if (result_valobj_sp->ResolveValue(scalar)) { diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 8173b7c3b5003..7352d6f33f217 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -444,7 +444,7 @@ Status PlatformWindows::UnloadImage(Process *process, uint32_t image_token) { return result; if (value->GetError().Fail()) - return value->GetError(); + return value->GetError().Clone(); Scalar scalar; if (value->ResolveValue(scalar)) { @@ -805,7 +805,7 @@ extern "C" { return error; if (value->GetError().Fail()) - return value->GetError(); + return value->GetError().Clone(); return 
Status(); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp index 8ba31b31e8dc1..f5fc337a8028e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp @@ -111,7 +111,7 @@ bool ScriptedProcessPythonInterface::CreateBreakpoint(lldb::addr_t addr, // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) @@ -128,7 +128,7 @@ lldb::DataExtractorSP ScriptedProcessPythonInterface::ReadMemoryAtAddress( // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); return data_sp; } @@ -145,7 +145,7 @@ lldb::offset_t ScriptedProcessPythonInterface::WriteMemoryAtAddress( // If there was an error on the python call, surface it to the user. if (py_error.Fail()) - error = py_error; + error = std::move(py_error); return obj->GetUnsignedIntegerValue(LLDB_INVALID_OFFSET); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index c1dcdc7df6cee..cb6f6ec398926 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -309,8 +309,12 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::PythonBoolean(arg); } - python::PythonObject Transform(Status arg) { - return python::SWIGBridge::ToSWIGWrapper(arg); + python::PythonObject Transform(const Status &arg) { + return python::SWIGBridge::ToSWIGWrapper(arg.Clone()); + } + + python::PythonObject Transform(Status &&arg) { + return python::SWIGBridge::ToSWIGWrapper(std::move(arg)); } python::PythonObject Transform(const StructuredDataImpl &arg) { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 5351c1a698b4a..97a3837fd7aa6 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -86,7 +86,7 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(lldb::ProcessSP process_sp); static PythonObject ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp); static PythonObject ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp); - static PythonObject ToSWIGWrapper(const Status &status); + static PythonObject ToSWIGWrapper(Status status); static PythonObject ToSWIGWrapper(const StructuredDataImpl &data_impl); static PythonObject ToSWIGWrapper(lldb::ThreadSP thread_sp); static PythonObject ToSWIGWrapper(lldb::StackFrameSP frame_sp); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 148932d67b908..1267e20f08712 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -268,7 +268,7 @@ class DWARFUnit : public UserID { /// .dwo file. 
Things like a missing .dwo file, DWO ID mismatch, and other /// .dwo errors can be stored in each compile unit so the issues can be /// communicated to the user. - void SetDwoError(const Status &error) { m_dwo_error = error; } + void SetDwoError(Status &&error) { m_dwo_error = std::move(error); } protected: DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index ff44329d081ca..f721ca00fd355 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -4480,7 +4480,7 @@ Status SymbolFileDWARF::CalculateFrameVariableError(StackFrame &frame) { dwarf_cu->ExtractUnitDIEIfNeeded(); const Status &dwo_error = dwarf_cu->GetDwoError(); if (dwo_error.Fail()) - return dwo_error; + return dwo_error.Clone(); // Don't return an error for assembly files as they typically don't have // varaible information. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index 0cd2d06cd708c..08ea4c6d1645a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -1574,7 +1574,7 @@ Status SymbolFileDWARFDebugMap::CalculateFrameVariableError(StackFrame &frame) { // we weren't able to open the .o file. Display an appropriate // error if (comp_unit_info->oso_load_error.Fail()) - return comp_unit_info->oso_load_error; + return comp_unit_info->oso_load_error.Clone(); else return Status::FromErrorStringWithFormat( "unable to load debug map object file \"%s\" " diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index ccae7ea106c97..f737836e0d971 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -139,7 +139,7 @@ Status CreateHostSysRootModuleLink(const FileSpec &root_dir_spec, DecrementRefExistingModule(root_dir_spec, sysroot_module_path_spec); } - const auto error = MakeDirectory( + Status error = MakeDirectory( FileSpec(sysroot_module_path_spec.GetDirectory().AsCString())); if (error.Fail()) return error; diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 7792edcc2cb58..3e7546048e586 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -549,7 +549,7 @@ Status Platform::Install(const FileSpec &src, const FileSpec &dst) { RecurseCopyBaton baton = {recurse_dst, this, Status()}; FileSystem::Instance().EnumerateDirectory( src_dir_path, true, true, true, RecurseCopy_Callback, &baton); - return baton.error; + return std::move(baton.error); } } break; @@ -1566,7 +1566,7 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec, // resolved_module_spec. // Trying to find a module by UUID on local file system. 
- const Status error = module_resolver(resolved_module_spec); + Status error = module_resolver(resolved_module_spec); if (error.Success()) { if (module_sp && symbol_file_spec) { // Set the symbol file to the module if the locate modudle callback was diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index f2a631a466b35..40f3115f1ff6d 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -2147,7 +2147,6 @@ size_t Process::ReadCStringFromMemory(addr_t addr, char *dst, result_error.Clear(); // NULL out everything just to be safe memset(dst, 0, dst_max_len); - Status error; addr_t curr_addr = addr; const size_t cache_line_size = m_memory_cache.GetMemoryCacheLineSize(); size_t bytes_left = dst_max_len - 1; @@ -2158,10 +2157,11 @@ size_t Process::ReadCStringFromMemory(addr_t addr, char *dst, cache_line_size - (curr_addr % cache_line_size); addr_t bytes_to_read = std::min(bytes_left, cache_line_bytes_left); + Status error; size_t bytes_read = ReadMemory(curr_addr, curr_dst, bytes_to_read, error); if (bytes_read == 0) { - result_error = error; + result_error = std::move(error); dst[total_cstr_len] = '\0'; break; } diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 1610971a34148..fe0d4c93c5062 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1117,7 +1117,7 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { frame_base = m_frame_base; if (error_ptr) - *error_ptr = m_frame_base_error; + *error_ptr = m_frame_base_error.Clone(); return m_frame_base_error.Success(); } diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index fcac0a48f46e6..f1c378b968d2b 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1986,7 +1986,6 @@ size_t Target::ReadCStringFromMemory(const Address &addr, char *dst, result_error.Clear(); // NULL out everything just to be safe memset(dst, 0, dst_max_len); - Status error; addr_t curr_addr = addr.GetLoadAddress(this); Address address(addr); @@ -2003,11 +2002,12 @@ size_t Target::ReadCStringFromMemory(const Address &addr, char *dst, cache_line_size - (curr_addr % cache_line_size); addr_t bytes_to_read = std::min(bytes_left, cache_line_bytes_left); + Status error; size_t bytes_read = ReadMemory(address, curr_dst, bytes_to_read, error, force_live_memory); if (bytes_read == 0) { - result_error = error; + result_error = std::move(error); dst[total_cstr_len] = '\0'; break; } @@ -2401,7 +2401,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec, } } if (error_ptr) - *error_ptr = error; + *error_ptr = std::move(error); return module_sp; } @@ -2730,7 +2730,7 @@ ExpressionResults Target::EvaluateExpression( // Pass up the error by wrapping it inside an error result. 
if (error.Fail() && !result_valobj_sp) result_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), error); + exe_ctx.GetBestExecutionContextScope(), std::move(error)); } if (execution_results == eExpressionCompleted) @@ -3348,10 +3348,8 @@ Status Target::Launch(ProcessLaunchInfo &launch_info, Stream *stream) { else error = m_process_sp->Resume(); if (!error.Success()) { - Status error2; - error2 = Status::FromErrorStringWithFormat( + error = Status::FromErrorStringWithFormat( "process resume at entry point failed: %s", error.AsCString()); - error = error2; } } break; case eStateExited: { diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 40e1fbf3fab1b..4af3af5fba018 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -107,6 +107,13 @@ llvm::Error Status::ToError() const { Status::~Status() = default; +const Status &Status::operator=(Status &&other) { + m_code = other.m_code; + m_type = other.m_type; + m_string = std::move(other.m_string); + return *this; +} + #ifdef _WIN32 static std::string RetrieveWin32ErrorString(uint32_t error_code) { char *buffer = nullptr; diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 0edde54d310fd..f2746c3e2516f 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -283,7 +283,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( } python::PythonObject -lldb_private::python::SWIGBridge::ToSWIGWrapper(const Status &status) { +lldb_private::python::SWIGBridge::ToSWIGWrapper(Status status) { return python::PythonObject(); } diff --git a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp index d7810b20af95d..3278674ed0a05 100644 --- a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp +++ b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp @@ -33,8 +33,8 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { MOCK_METHOD0(CalculateTrapHandlerSymbolNames, void()); MOCK_METHOD2(ResolveExecutable, - std::pair(const ModuleSpec &, - const FileSpecList *)); + std::pair(const ModuleSpec &, + const FileSpecList *)); Status ResolveExecutable(const ModuleSpec &module_spec, lldb::ModuleSP &exe_module_sp, @@ -42,7 +42,7 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { { // NOLINT(modernize-use-override) auto pair = ResolveExecutable(module_spec, module_search_paths_ptr); exe_module_sp = pair.second; - return pair.first; + return pair.first ? 
Status() : Status::FromErrorString("error"); } void SetRemotePlatform(lldb::PlatformSP platform) { @@ -81,7 +81,7 @@ TEST_F(RemoteAwarePlatformTest, TestResolveExecutabelOnClientByPlatform) { EXPECT_CALL(platform, GetSupportedArchitectures(process_host_arch)) .WillRepeatedly(Return(std::vector())); EXPECT_CALL(platform, ResolveExecutable(_, _)) - .WillRepeatedly(Return(std::make_pair(Status(), expected_executable))); + .WillRepeatedly(Return(std::make_pair(true, expected_executable))); platform.SetRemotePlatform(std::make_shared(false)); From 7b760894f247f4fa1b27c01c767c8599c169f996 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:49:16 -0700 Subject: [PATCH 285/425] [lldb] Convert NativeProcessLinux to new Status API (NFC) --- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index cc0e34eecdf30..cea3fbf9112f5 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1096,9 +1096,9 @@ Status NativeProcessLinux::Detach() { for (const auto &thread : m_threads) { Status e = Detach(thread->GetID()); + // Save the error, but still attempt to detach from other threads. if (e.Fail()) - error = - e; // Save the error, but still attempt to detach from other threads. + error = e.Clone; } m_intel_pt_collector.Clear(); @@ -1905,13 +1905,13 @@ Status NativeProcessLinux::ResumeThread(NativeThreadLinux &thread, // reflect it is running after this completes. switch (state) { case eStateRunning: { - const auto resume_result = thread.Resume(signo); + Status resume_result = thread.Resume(signo); if (resume_result.Success()) SetState(eStateRunning, true); return resume_result; } case eStateStepping: { - const auto step_result = thread.SingleStep(signo); + Status step_result = thread.SingleStep(signo); if (step_result.Success()) SetState(eStateRunning, true); return step_result; From 54b10555c32e9677ce15c408296f92b35cd3d29c Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 5 Sep 2024 15:51:00 -0400 Subject: [PATCH 286/425] [OpenMP] LIBOMPTARGET_DEVICE_ARCHITECTURES requires semicolons (#107454) If I use commas to delimit architectures in `LIBOMPTARGET_DEVICE_ARCHITECTURES`, cmake for the runtimes complains: ``` Unknown GPU architecture 'sm_70,sm_80,sm_90' ``` Semicolons are required instead. --- openmp/docs/SupportAndFAQ.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst index a158422befd07..cd2d6a4703221 100644 --- a/openmp/docs/SupportAndFAQ.rst +++ b/openmp/docs/SupportAndFAQ.rst @@ -81,9 +81,9 @@ The Cuda SDK is required on the machine that will execute the openmp application If your build machine is not the target machine or automatic detection of the available GPUs failed, you should also set: -- ``LIBOMPTARGET_DEVICE_ARCHITECTURES=sm_,...`` where ```` is the numeric +- ``LIBOMPTARGET_DEVICE_ARCHITECTURES='sm_;...'`` where ```` is the numeric compute capability of your GPU. For instance, set - ``LIBOMPTARGET_DEVICE_ARCHITECTURES=sm_70,sm_80`` to target the Nvidia Volta + ``LIBOMPTARGET_DEVICE_ARCHITECTURES='sm_70;sm_80'`` to target the Nvidia Volta and Ampere architectures. 
@@ -141,9 +141,9 @@ With those libraries installed, then LLVM build and installed, try: If your build machine is not the target machine or automatic detection of the available GPUs failed, you should also set: -- ``LIBOMPTARGET_DEVICE_ARCHITECTURES=gfx,...`` where ```` is the +- ``LIBOMPTARGET_DEVICE_ARCHITECTURES='gfx;...'`` where ```` is the shader core instruction set architecture. For instance, set - ``LIBOMPTARGET_DEVICE_ARCHITECTURES=gfx906,gfx90a`` to target AMD GCN5 + ``LIBOMPTARGET_DEVICE_ARCHITECTURES='gfx906;gfx90a'`` to target AMD GCN5 and CDNA2 devices. Q: What are the known limitations of OpenMP AMDGPU offload? From 3b426a8951caa543b65f20ff265353fd79f436e5 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 12:53:08 -0700 Subject: [PATCH 287/425] [lldb] Convert NativeProcessLinux to new Status API (NFC) --- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index cea3fbf9112f5..5c262db8db7fd 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1098,7 +1098,7 @@ Status NativeProcessLinux::Detach() { Status e = Detach(thread->GetID()); // Save the error, but still attempt to detach from other threads. if (e.Fail()) - error = e.Clone; + error = e.Clone(); } m_intel_pt_collector.Clear(); From 3726f9c57537aff05bd6ecf309133ce05bebaf43 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:10:37 -0700 Subject: [PATCH 288/425] [Attributor][NFC] Pre-commits for #107439 (#107457) --- .../Attributor/ArgumentPromotion/byval.ll | 8 +- .../IPConstantProp/2009-09-24-byval-ptr.ll | 8 +- .../IPConstantProp/return-constants.ll | 12 +- .../Transforms/Attributor/assumes_info.ll | 2 + .../Attributor/cb_liveness_disabled.ll | 1 + .../Attributor/cb_liveness_enabled.ll | 1 + .../Attributor/phi_bug_pointer_info.ll | 26 +++ .../Attributor/value-simplify-local-remote.ll | 47 ++-- .../Attributor/value-simplify-pointer-info.ll | 212 ++++++++++++++++++ 9 files changed, 284 insertions(+), 33 deletions(-) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll index 621c6cf94313e..516b80b1e237b 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll @@ -59,12 +59,12 @@ define i32 @main() nounwind { ; TUNIT-NEXT: store i32 1, ptr [[S]], align 32 ; TUNIT-NEXT: [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8 -; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4 -; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B41]], align 8 +; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8 ; TUNIT-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR1:[0-9]+]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[S]], align 32 -; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 -; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B4]], align 32 +; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B41]], align 32 ; TUNIT-NEXT: [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR1]] ; TUNIT-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] 
; TUNIT-NEXT: ret i32 [[A]] diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll index a209ee2ebe064..249119a77a4d0 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll @@ -139,12 +139,12 @@ define i32 @unions_v2() nounwind { ; TUNIT-SAME: () #[[ATTR2]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 -; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 +; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 ; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 -; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 +; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 ; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3]] ; TUNIT-NEXT: ret i32 [[RESULT]] ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll index 343b6b9dd433c..d1c331ce19651 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll @@ -62,13 +62,13 @@ define %0 @caller(i1 %Q) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@caller ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] { -; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1:[0-9]+]] +; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1:[0-9]+]] ; TUNIT-NEXT: ret [[TMP0]] [[X]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@caller ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1:[0-9]+]] { -; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2:[0-9]+]] +; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: ret [[TMP0]] [[X]] ; %X = call %0 @foo(i1 %Q) @@ -87,10 +87,10 @@ define i32 @caller2(i1 %Q) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@caller2 ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] { -; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] +; TUNIT-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO]](i1 noundef [[Q]]) #[[ATTR1]] ; TUNIT-NEXT: [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0 ; TUNIT-NEXT: [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1 -; TUNIT-NEXT: [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] +; TUNIT-NEXT: [[Y:%.*]] = call [[TMP0]] 
@[[BAR:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]] ; TUNIT-NEXT: [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0 ; TUNIT-NEXT: [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1 ; TUNIT-NEXT: [[M:%.*]] = add i32 [[A]], [[C]] @@ -101,10 +101,10 @@ define i32 @caller2(i1 %Q) { ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@caller2 ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1]] { -; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] +; CGSCC-NEXT: [[X:%.*]] = call [[TMP0:%.*]] @[[FOO]](i1 noundef [[Q]]) #[[ATTR2]] ; CGSCC-NEXT: [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0 ; CGSCC-NEXT: [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1 -; CGSCC-NEXT: [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] +; CGSCC-NEXT: [[Y:%.*]] = call [[TMP0]] @[[BAR:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]] ; CGSCC-NEXT: [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0 ; CGSCC-NEXT: [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1 ; CGSCC-NEXT: [[M:%.*]] = add i32 [[A]], [[C]] diff --git a/llvm/test/Transforms/Attributor/assumes_info.ll b/llvm/test/Transforms/Attributor/assumes_info.ll index df7d7ddd13356..088739ec5b812 100644 --- a/llvm/test/Transforms/Attributor/assumes_info.ll +++ b/llvm/test/Transforms/Attributor/assumes_info.ll @@ -119,9 +119,11 @@ attributes #3 = { "llvm.assume"="B,C,A" } ; TUNIT: attributes #[[ATTR0]] = { "llvm.assume"="A" } ; TUNIT: attributes #[[ATTR1]] = { "llvm.assume"="A,B" } ; TUNIT: attributes #[[ATTR2]] = { "llvm.assume"="A,B,C" } +; TUNIT: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="B,C,A" } ;. ; CGSCC: attributes #[[ATTR0]] = { "llvm.assume"="A" } ; CGSCC: attributes #[[ATTR1]] = { "llvm.assume"="A,B" } ; CGSCC: attributes #[[ATTR2]] = { "llvm.assume"="A,B,C" } ; CGSCC: attributes #[[ATTR3]] = { "llvm.assume"="B" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="B,C,A" } ;. diff --git a/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll b/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll index f8f374ab66141..2f64cdbca2010 100644 --- a/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll +++ b/llvm/test/Transforms/Attributor/cb_liveness_disabled.ll @@ -189,6 +189,7 @@ define i32 @test_ncheck2(i32 %0) #0 { attributes #0 = { noinline nounwind sspstrong uwtable} +;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind sspstrong willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nofree nosync nounwind willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll b/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll index 32c08ee92ddef..585b6ef10c4fe 100644 --- a/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll +++ b/llvm/test/Transforms/Attributor/cb_liveness_enabled.ll @@ -192,6 +192,7 @@ define i32 @test_ncheck2(i32 %0) #0 { attributes #0 = { noinline nounwind sspstrong uwtable} +;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind sspstrong willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nofree nosync nounwind willreturn memory(none) } ;. 
diff --git a/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll b/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll index bb423e10f2c72..b4a2192c25faa 100644 --- a/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll +++ b/llvm/test/Transforms/Attributor/phi_bug_pointer_info.ll @@ -17,7 +17,30 @@ ; CHECK: - c: %1 = load i32, ptr %val2, align 4 ; CHECK: - 6 - %ret = load i32, ptr %x, align 4 ; CHECK: - c: +;. +; CHECK: @globalBytes = internal global [1024 x i8] zeroinitializer +;. define dso_local i32 @phi_different_offsets(ptr nocapture %val, ptr nocapture %val2, i1 %cmp) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; CHECK-LABEL: define dso_local i32 @phi_different_offsets +; CHECK-SAME: (ptr nocapture nofree readonly [[VAL:%.*]], ptr nocapture nofree readonly [[VAL2:%.*]], i1 noundef [[CMP:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[FIELD2:%.*]] = getelementptr i32, ptr @globalBytes, i32 2 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[FIELD2]], align 8 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: [[FIELD8:%.*]] = getelementptr i32, ptr @globalBytes, i32 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL2]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[FIELD8]], align 16 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X:%.*]] = phi ptr [ [[FIELD2]], [[THEN]] ], [ [[FIELD8]], [[ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[X]], align 8 +; CHECK-NEXT: ret i32 [[RET]] +; entry: br i1 %cmp, label %then, label %else @@ -39,3 +62,6 @@ end: ret i32 %ret } +;. +; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn } +;. 
diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll index 20b52c3fcd85a..9ff51e97d6e15 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll @@ -106,7 +106,7 @@ define ptr @t2(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR1:[0-9]+]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @foo.1(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4:[0-9]+]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[FOO_1:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4:[0-9]+]] ; TUNIT-NEXT: [[TEST_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; TUNIT-NEXT: ret ptr [[TEST_RET]] ; @@ -115,7 +115,7 @@ define ptr @t2(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR2:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @foo.1(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8:[0-9]+]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[FOO_1:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8:[0-9]+]] ; CGSCC-NEXT: [[TEST_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret ptr [[TEST_RET]] ; @@ -205,7 +205,7 @@ define ptr @foo(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @bar.5(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[BAR_5:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]] ; TUNIT-NEXT: [[FOO_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; TUNIT-NEXT: ret ptr [[FOO_RET]] ; @@ -214,7 +214,7 @@ define ptr @foo(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-SAME: (ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], ptr nocapture nofree readnone [[THIS_ADDR:%.*]], ptr nocapture nofree readnone [[THIS1:%.*]]) #[[ATTR2]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @bar.5(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[BAR_5:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] ; CGSCC-NEXT: [[FOO_RET:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret ptr [[FOO_RET]] ; @@ -317,7 +317,7 @@ define weak_odr void @t3() { ; CHECK: for.cond: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[CALL4:%.*]] = 
call [[S_2:%.*]] @t3.helper() +; CHECK-NEXT: [[CALL4:%.*]] = call [[S_2:%.*]] @[[T3_HELPER:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CHECK-NEXT: ret void ; entry: @@ -380,8 +380,8 @@ define dso_local void @spam() { ; TUNIT-NEXT: store i32 [[X]], ptr [[TMP]], align 4 ; TUNIT-NEXT: br label [[BB16:%.*]] ; TUNIT: bb16: -; TUNIT-NEXT: [[TRUETMP18:%.*]] = icmp eq i32 [[X]], 0 -; TUNIT-NEXT: br i1 [[TRUETMP18]], label [[BB35:%.*]], label [[BB19:%.*]] +; TUNIT-NEXT: [[TMP18:%.*]] = icmp eq i32 [[X]], 0 +; TUNIT-NEXT: br i1 [[TMP18]], label [[BB35:%.*]], label [[BB19:%.*]] ; TUNIT: bb19: ; TUNIT-NEXT: br label [[BB23:%.*]] ; TUNIT: bb23: @@ -404,8 +404,8 @@ define dso_local void @spam() { ; CGSCC-NEXT: store i32 [[X]], ptr [[TMP]], align 4 ; CGSCC-NEXT: br label [[BB16:%.*]] ; CGSCC: bb16: -; CGSCC-NEXT: [[TRUETMP18:%.*]] = icmp eq i32 [[X]], 0 -; CGSCC-NEXT: br i1 [[TRUETMP18]], label [[BB35:%.*]], label [[BB19:%.*]] +; CGSCC-NEXT: [[TMP18:%.*]] = icmp eq i32 [[X]], 0 +; CGSCC-NEXT: br i1 [[TMP18]], label [[BB35:%.*]], label [[BB19:%.*]] ; CGSCC: bb19: ; CGSCC-NEXT: br label [[BB23:%.*]] ; CGSCC: bb23: @@ -467,7 +467,7 @@ define double @t4(ptr %this, ptr %this.addr, ptr %this1) { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[THIS_ADDR1:%.*]] = alloca ptr, i32 0, align 8 ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @t4a(ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR5]] +; TUNIT-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[T4A:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR5]] ; TUNIT-NEXT: ret double 0.000000e+00 ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) @@ -476,7 +476,7 @@ define double @t4(ptr %this, ptr %this.addr, ptr %this1) { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[THIS_ADDR1:%.*]] = alloca ptr, i32 0, align 8 ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @t4a(ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] +; CGSCC-NEXT: [[CALL:%.*]] = call [[S:%.*]] @[[T4A:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR8]] ; CGSCC-NEXT: [[TMP0:%.*]] = extractvalue [[S]] [[CALL]], 0 ; CGSCC-NEXT: ret double 0.000000e+00 ; @@ -615,12 +615,21 @@ entry: ; CGSCC: attributes #[[ATTR8]] = { nofree nounwind willreturn } ; CGSCC: attributes #[[ATTR9]] = { nofree nounwind willreturn memory(readwrite) } ;. 
-; CHECK: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} -; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} -; CHECK: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; TUNIT: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} +; TUNIT: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; TUNIT: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; TUNIT: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; TUNIT: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} +; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +;. +; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} +; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CGSCC: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CGSCC: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CGSCC: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} +; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index 06a52ce936eec..f7f92e3c87a62 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -3176,6 +3176,216 @@ define internal i32 @recSimplify2() { ret i32 %r } +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough(ptr noundef %A) + %call1 = call ptr @passthrough(ptr noundef %B) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal void @write_both(ptr noundef %Q, ptr noundef %R) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; CHECK-LABEL: define {{[^@]+}}@write_both +; CHECK-SAME: (ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[Q:%.*]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 3, ptr [[Q]], align 4 +; CHECK-NEXT: store i32 5, ptr [[R]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 3, ptr %Q, align 4 + store i32 5, ptr %R, align 4 + ret void +} + +define internal ptr @passthrough(ptr noundef %P) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough +; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret ptr [[P]] +; +entry: + ret ptr %P +} + +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_choice +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]]) #[[ATTR23:[0-9]+]] +; TUNIT-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_choice +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i1 [[C:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[B]]) #[[ATTR28:[0-9]+]] +; CGSCC-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) [[A]]) #[[ATTR28]] +; CGSCC-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_choice(i1 %c, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_choice(i1 %c, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal ptr @passthrough_choice(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define {{[^@]+}}@passthrough_choice +; CHECK-SAME: (i1 [[C:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" 
[[Q:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], ptr [[P]], ptr [[Q]] +; CHECK-NEXT: ret ptr [[R]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} + +; TODO: Verify we do not return 10. +define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_no_choice_true(i1 true, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_no_choice_true(i1 true, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +; TODO: Verify we do not return 10. 
+define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 +; TUNIT-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR4]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 +; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR18]] +; TUNIT-NEXT: ret i32 10 +; +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 +; CGSCC-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]]) #[[ATTR16]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: [[B:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: call void @write_both(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR21]] +; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; CGSCC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; CGSCC-NEXT: ret i32 [[ADD]] +; +entry: + %A = alloca i32, align 4 + %B = alloca i32, align 4 + %call = call ptr @passthrough_no_choice_false(i1 false, ptr noundef %A, ptr noundef %B) + %call1 = call ptr @passthrough_no_choice_false(i1 false, ptr noundef %B, ptr noundef %A) + call void @write_both(ptr noundef %call, ptr noundef %call1) + %0 = load i32, ptr %A, align 4 + %1 = load i32, ptr %B, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define internal ptr @passthrough_no_choice_true(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_true +; CGSCC-SAME: (ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]], i32 [[TMP0:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[Q_PRIV:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: store i32 [[TMP0]], ptr [[Q_PRIV]], align 4 +; CGSCC-NEXT: ret ptr [[P]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} +define internal ptr @passthrough_no_choice_false(i1 %c, ptr noundef %P, ptr noundef %Q) { +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@passthrough_no_choice_false +; CGSCC-SAME: (i32 [[TMP0:%.*]], ptr noalias nofree noundef nonnull readnone returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q:%.*]]) #[[ATTR4]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[P_PRIV:%.*]] = alloca i32, align 4 +; CGSCC-NEXT: store i32 [[TMP0]], ptr [[P_PRIV]], align 4 +; CGSCC-NEXT: ret ptr [[Q]] +; +entry: + %R = select i1 %c, ptr %P, ptr %Q + ret ptr %R +} + declare void @llvm.assume(i1 noundef) @@ -3238,6 +3448,7 @@ declare void @llvm.assume(i1 noundef) ; TUNIT: attributes #[[ATTR20]] = { norecurse } ; TUNIT: attributes #[[ATTR21]] = { nounwind } ; TUNIT: attributes #[[ATTR22]] = { nofree nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR23]] = { nofree nosync nounwind willreturn memory(none) } ;. 
; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) } @@ -3267,6 +3478,7 @@ declare void @llvm.assume(i1 noundef) ; CGSCC: attributes #[[ATTR25]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR26]] = { nofree nounwind willreturn } ; CGSCC: attributes #[[ATTR27]] = { nofree } +; CGSCC: attributes #[[ATTR28]] = { nofree nosync willreturn } ;. ; TUNIT: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} From 4ce8808dd96dd6f1380c4e27c04ff0a0a0fed12b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 5 Sep 2024 21:12:48 +0100 Subject: [PATCH 289/425] [AMDGPU] Common up default value of -amdgpu-nsa-threshold. NFC. The default value of 3 was specified in two places. Use the actual value of the cl::init to avoid repeating it. --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 352994e541fc8..52c24a5c25ec2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -708,7 +708,7 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { if (Value > 0) return std::max(Value, 2); - return 3; + return NSAThreshold; } GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, From e6dece9f6947a50c36f714d3fc0d86c6ad9acc9b Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:33:58 -0700 Subject: [PATCH 290/425] [Attributor][FIX] Mark "may" accesses through call sites as such (#107439) Before, we kept the call site access kind (may/must) when we translated the access. However, the pointer we access it through (by passing it to the callee) might not be the underlying object. We have similar logic when we add store and load accesses. --- .../Transforms/IPO/AttributorAttributes.cpp | 20 ++++++++----- .../Attributor/value-simplify-pointer-info.ll | 28 +++++++++++++------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 69d29b6c04234..6b6d6d8d2a1e4 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1397,7 +1397,8 @@ struct AAPointerInfoImpl } ChangeStatus translateAndAddState(Attributor &A, const AAPointerInfo &OtherAA, - const OffsetInfo &Offsets, CallBase &CB) { + const OffsetInfo &Offsets, CallBase &CB, + bool IsMustAcc) { using namespace AA::PointerInfo; if (!OtherAA.getState().isValidState() || !isValidState()) return indicatePessimisticFixpoint(); @@ -1410,6 +1411,8 @@ struct AAPointerInfoImpl for (const auto &It : State) { for (auto Index : It.getSecond()) { const auto &RAcc = State.getAccess(Index); + if (!IsMustAcc && RAcc.isAssumption()) + continue; for (auto Offset : Offsets) { auto NewRanges = Offset == AA::RangeTy::Unknown ? 
AA::RangeTy::getUnknown() @@ -1417,9 +1420,11 @@ struct AAPointerInfoImpl if (!NewRanges.isUnknown()) { NewRanges.addToAllOffsets(Offset); } - Changed |= - addAccess(A, NewRanges, CB, RAcc.getContent(), RAcc.getKind(), - RAcc.getType(), RAcc.getRemoteInst()); + AccessKind AK = RAcc.getKind(); + if (!IsMustAcc) + AK = AccessKind((AK & ~AK_MUST) | AK_MAY); + Changed |= addAccess(A, NewRanges, CB, RAcc.getContent(), AK, + RAcc.getType(), RAcc.getRemoteInst()); } } } @@ -1893,9 +1898,10 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) { DepClassTy::REQUIRED); if (!CSArgPI) return false; - Changed = - translateAndAddState(A, *CSArgPI, OffsetInfoMap[CurPtr], *CB) | - Changed; + bool IsMustAcc = (getUnderlyingObject(CurPtr) == &AssociatedValue); + Changed = translateAndAddState(A, *CSArgPI, OffsetInfoMap[CurPtr], *CB, + IsMustAcc) | + Changed; return isValidState(); } LLVM_DEBUG(dbgs() << "[AAPointerInfo] Call user not handled " << *CB diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index f7f92e3c87a62..69bff7b5e783e 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -3176,7 +3176,7 @@ define internal i32 @recSimplify2() { ret i32 %r } -; TODO: Verify we do not return 10. +; Verify we do not return 10. define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return @@ -3185,7 +3185,10 @@ define i32 @may_access_after_return(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return @@ -3237,7 +3240,7 @@ entry: ret ptr %P } -; TODO: Verify we do not return 10. +; Verify we do not return 10. 
define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_choice @@ -3248,7 +3251,10 @@ define i32 @may_access_after_return_choice(i32 noundef %N, i32 noundef %M, i1 %c ; TUNIT-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]]) #[[ATTR23:[0-9]+]] ; TUNIT-NEXT: [[CALL1:%.*]] = call nonnull align 4 dereferenceable(4) ptr @passthrough_choice(i1 [[C]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[B]], ptr noalias nofree noundef nonnull readnone align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR23]] ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[CALL1]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_choice @@ -3289,7 +3295,7 @@ entry: ret ptr %R } -; TODO: Verify we do not return 10. +; Verify we do not return 10. define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 @@ -3298,7 +3304,10 @@ define i32 @may_access_after_return_no_choice1(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice1 @@ -3324,7 +3333,7 @@ entry: ret i32 %add } -; TODO: Verify we do not return 10. +; Verify we do not return 10. 
define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 @@ -3333,7 +3342,10 @@ define i32 @may_access_after_return_no_choice2(i32 noundef %N, i32 noundef %M) { ; TUNIT-NEXT: [[A:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[B:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: call void @write_both(ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]], ptr nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) #[[ATTR18]] -; TUNIT-NEXT: ret i32 10 +; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +; TUNIT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]] +; TUNIT-NEXT: ret i32 [[ADD]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@may_access_after_return_no_choice2 From 08533a3ee8f3a09a59cf6ac3be59198b26b7f739 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 5 Sep 2024 13:36:26 -0700 Subject: [PATCH 291/425] [Offload][NFC] Reorganize `utils::` and make Device/Host/Shared clearer (#100280) We had three `utils::` namespaces, all with different "meaning" (host, device, hsa_utils). We should, when we can, keep "include/Shared" accessible from host and device, thus RefCountTy has been moved to a separate header. `hsa_utils` was introduced to make `utils::` less overloaded. And common functionality was de-duplicated, e.g., `utils::advance` and `utils::advanceVoidPtr` -> `utils:advancePtr`. Type punning now checks for the size of the result to make sure it matches the source type. No functional change was intended. 
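
A minimal sketch of the consolidated helpers this message refers to (illustrative only; the exact signatures live in `offload/include/Shared/Utils.h` and are not reproduced here, and the names below are taken from the description above, i.e. `advancePtr` replacing `advance`/`advanceVoidPtr`, plus the new size check on type punning):

```cpp
namespace utils {

/// Advance \p Ptr by \p Bytes bytes, keeping the pointee type.
/// (Sketch of the shared advancePtr helper; replaces the old
/// utils::advance / utils::advanceVoidPtr pair.)
template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Bytes) {
  return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
}

/// Return \p V type-punned as \p DstTy. The sizes must now match;
/// the static_assert is the "checks for the size of the result" part.
template <typename DstTy, typename SrcTy> DstTy convertViaPun(SrcTy V) {
  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
  return *((DstTy *)(&V));
}

} // namespace utils
```

With the old `convertViaPun` shown in the deleted DeviceRTL `Utils.h`, punning between types of different sizes compiled and simply read or truncated bytes; with a size check of this kind, such a mismatch fails at compile time instead.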
--- offload/DeviceRTL/CMakeLists.txt | 6 +- offload/DeviceRTL/include/Allocator.h | 2 +- offload/DeviceRTL/include/Configuration.h | 2 +- .../include/{Types.h => DeviceTypes.h} | 6 +- offload/DeviceRTL/include/DeviceUtils.h | 54 ++++++++++ offload/DeviceRTL/include/Interface.h | 2 +- offload/DeviceRTL/include/LibC.h | 2 +- offload/DeviceRTL/include/Mapping.h | 2 +- offload/DeviceRTL/include/State.h | 4 +- offload/DeviceRTL/include/Synchronization.h | 2 +- offload/DeviceRTL/include/Utils.h | 100 ------------------ offload/DeviceRTL/src/Allocator.cpp | 4 +- offload/DeviceRTL/src/Configuration.cpp | 2 +- offload/DeviceRTL/src/Debug.cpp | 2 +- .../src/{Utils.cpp => DeviceUtils.cpp} | 16 +-- offload/DeviceRTL/src/Kernel.cpp | 2 +- offload/DeviceRTL/src/Mapping.cpp | 4 +- offload/DeviceRTL/src/Misc.cpp | 2 +- offload/DeviceRTL/src/Parallelism.cpp | 4 +- offload/DeviceRTL/src/Reduction.cpp | 4 +- offload/DeviceRTL/src/State.cpp | 16 +-- offload/DeviceRTL/src/Synchronization.cpp | 4 +- offload/DeviceRTL/src/Tasking.cpp | 6 +- offload/DeviceRTL/src/Workshare.cpp | 6 +- offload/include/Shared/RefCnt.h | 56 ++++++++++ offload/include/Shared/Types.h | 22 ++++ offload/include/Shared/Utils.h | 100 ++++++++---------- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 88 +++++++-------- .../amdgpu/utils/UtilitiesRTL.h | 5 +- .../common/include/PluginInterface.h | 18 ++-- .../common/src/GlobalHandler.cpp | 4 +- offload/plugins-nextgen/common/src/JIT.cpp | 4 +- .../common/src/PluginInterface.cpp | 26 ++--- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- offload/src/DeviceImage.cpp | 5 +- offload/src/omptarget.cpp | 4 +- 36 files changed, 308 insertions(+), 280 deletions(-) rename offload/DeviceRTL/include/{Types.h => DeviceTypes.h} (97%) create mode 100644 offload/DeviceRTL/include/DeviceUtils.h delete mode 100644 offload/DeviceRTL/include/Utils.h rename offload/DeviceRTL/src/{Utils.cpp => DeviceUtils.cpp} (90%) create mode 100644 offload/include/Shared/RefCnt.h create mode 100644 offload/include/Shared/Types.h diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index ad465f0ccbd61..6b86c4d1ce0cf 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -80,8 +80,8 @@ set(include_files ${include_directory}/Profiling.h ${include_directory}/State.h ${include_directory}/Synchronization.h - ${include_directory}/Types.h - ${include_directory}/Utils.h + ${include_directory}/DeviceTypes.h + ${include_directory}/DeviceUtils.h ${include_directory}/Workshare.h ) @@ -99,7 +99,7 @@ set(src_files ${source_directory}/State.cpp ${source_directory}/Synchronization.cpp ${source_directory}/Tasking.cpp - ${source_directory}/Utils.cpp + ${source_directory}/DeviceUtils.cpp ${source_directory}/Workshare.cpp ) diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index 23e0106c80a2c..475f6a21bb47e 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_ALLOCATOR_H #define OMPTARGET_ALLOCATOR_H -#include "Types.h" +#include "DeviceTypes.h" // Forward declaration. 
struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -15,7 +15,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { namespace config { diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h similarity index 97% rename from offload/DeviceRTL/include/Types.h rename to offload/DeviceRTL/include/DeviceTypes.h index cd8f925a392a8..c7132be345e0e 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -1,4 +1,4 @@ -//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -115,9 +115,9 @@ enum kmp_sched_t { #define SCHEDULE_WITHOUT_MODIFIERS(s) \ (enum kmp_sched_t)( \ (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) #define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s)&kmp_sched_modifier_nonmonotonic) != 0) + (((s) & kmp_sched_modifier_nonmonotonic) != 0) #define SCHEDULE_HAS_NO_MODIFIERS(s) \ (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ 0) diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h new file mode 100644 index 0000000000000..549ca16e1c34c --- /dev/null +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -0,0 +1,54 @@ +//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H +#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H + +#include "DeviceTypes.h" +#include "Shared/Utils.h" + +#pragma omp begin declare target device_type(nohost) + +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); + +uint64_t ballotSync(uint64_t Mask, int32_t Pred); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). +bool isSharedMemPtr(void *Ptr); + +/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). +bool isThreadLocalMemPtr(void *Ptr); + +/// A pointer variable that has by design an `undef` value. Use with care. 
+[[clang::loader_uninitialized]] static void *const UndefPtr; + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils + +#pragma omp end declare target + +#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index d36d4227091ef..c4bfaaa2404b4 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -14,7 +14,7 @@ #include "Shared/Environment.h" -#include "Types.h" +#include "DeviceTypes.h" /// External API /// diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index 59a795cc62e0e..03febdb508342 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_LIBC_H #define OMPTARGET_LIBC_H -#include "Types.h" +#include "DeviceTypes.h" extern "C" { diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 165904644dbb9..2fb87abe5418c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_MAPPING_H #define OMPTARGET_MAPPING_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 1a3490394458f..565235cd48a91 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -15,9 +15,9 @@ #include "Shared/Environment.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Mapping.h" -#include "Types.h" -#include "Utils.h" // Forward declaration. struct KernelEnvironmentTy; diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index af9e1a673e6a2..874974cc861df 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -12,7 +12,7 @@ #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#include "Types.h" +#include "DeviceTypes.h" namespace ompx { diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h deleted file mode 100644 index 82e2397b5958b..0000000000000 --- a/offload/DeviceRTL/include/Utils.h +++ /dev/null @@ -1,100 +0,0 @@ -//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_UTILS_H -#define OMPTARGET_DEVICERTL_UTILS_H - -#include "Types.h" - -#pragma omp begin declare target device_type(nohost) - -namespace ompx { -namespace utils { - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. 
-uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Round up \p V to a \p Boundary. -template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; -} - -/// Advance \p Ptr by \p Bytes bytes. -template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { - return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_ffs(V); -} - -/// Return the first bit set in \p V. -inline uint32_t ffs(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_ffsl(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint32_t V) { - static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); - return __builtin_popcount(V); -} - -/// Return the number of bits set in \p V. -inline uint32_t popc(uint64_t V) { - static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); - return __builtin_popcountl(V); -} - -/// Return \p V aligned "upwards" according to \p Align. -template inline Ty1 align_up(Ty1 V, Ty2 Align) { - return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align); -} -/// Return \p V aligned "downwards" according to \p Align. -template inline Ty1 align_down(Ty1 V, Ty2 Align) { - return V - V % Align; -} - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return \p V typed punned as \p DstTy. -template inline DstTy convertViaPun(SrcTy V) { - return *((DstTy *)(&V)); -} - -/// A pointer variable that has by design an `undef` value. Use with care. 
-[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils -} // namespace ompx - -#pragma omp end declare target - -#endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index c9c940de62c1a..ac662c48d4f5f 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -12,10 +12,10 @@ #include "Allocator.h" #include "Configuration.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Mapping.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index ef0c3663536f5..9e14c203d4a04 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" +#include "DeviceTypes.h" #include "State.h" -#include "Types.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 5a2c84c7ee38a..b451f17c6bbd8 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -14,10 +14,10 @@ #include "Configuration.h" #include "Debug.h" +#include "DeviceTypes.h" #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp similarity index 90% rename from offload/DeviceRTL/src/Utils.cpp rename to offload/DeviceRTL/src/DeviceUtils.cpp index 53cc803234867..c204a7be73b1f 100644 --- a/offload/DeviceRTL/src/Utils.cpp +++ b/offload/DeviceRTL/src/DeviceUtils.cpp @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include "DeviceUtils.h" #include "Debug.h" #include "Interface.h" @@ -33,7 +33,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, int32_t Width); @@ -44,8 +44,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred); ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - int Width = mapping::getWarpSize(); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { int Self = mapping::getThreadIdInWarp(); int Index = SrcLane + (Self & ~(Width - 1)); return __builtin_amdgcn_ds_bpermute(Index << 2, Var); @@ -77,8 +76,8 @@ bool isSharedMemPtr(const void *Ptr) { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1); } int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { @@ -104,8 +103,9 @@ void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { impl::Unpack(Val, &LowBits, &HighBits); } -int32_t utils::shuffle(uint64_t Mask, 
int32_t Var, int32_t SrcLane) { - return impl::shuffle(Mask, Var, SrcLane); +int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { + return impl::shuffle(Mask, Var, SrcLane, Width); } int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index e70704f25e922..8bb275eae776c 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -14,11 +14,11 @@ #include "Allocator.h" #include "Debug.h" +#include "DeviceTypes.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" #include "Workshare.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index c1ce878746a69..3aefcff68e195 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -10,10 +10,10 @@ //===----------------------------------------------------------------------===// #include "Mapping.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index ce4a221bdb37d..8e690f6fd8e7c 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -11,7 +11,7 @@ #include "Allocator.h" #include "Configuration.h" -#include "Types.h" +#include "DeviceTypes.h" #include "Debug.h" diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 15b991f202539..5286d53b623f0 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -33,12 +33,12 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 744d1a3a231c8..57df159d3f28e 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -11,12 +11,12 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index f43f2cedb431d..855c74fa58e0a 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -13,13 +13,13 @@ #include "Allocator.h" #include "Configuration.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "LibC.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy { /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint32_t Bytes); + void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. 
uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - allocator::ALIGNMENT); + return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); +void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { + uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 80ba87b300bcd..d6452a5d589c5 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -13,11 +13,11 @@ #include "Synchronization.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Types.h" -#include "Utils.h" #pragma omp begin declare target device_type(nohost) diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 2dc33562e6d79..23a967c1a337e 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "State.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); TaskDescriptor->Payload = - utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); TaskDescriptor->TaskFn = TaskFn; return TaskDescriptor; diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 7e087a07e4420..ad60e66548be9 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -14,12 +14,12 @@ #include "Workshare.h" #include "Debug.h" +#include "DeviceTypes.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" #include "Synchronization.h" -#include "Types.h" -#include "Utils.h" using namespace ompx; @@ -349,7 +349,7 @@ template struct omptarget_nvptx_LoopSupport { if (rank == 0) { warp_res = atomic::add(&Cnt, change, atomic::seq_cst); } - warp_res = utils::shuffle(active, warp_res, leader); + warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); return warp_res + rank; } diff --git 
a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h new file mode 100644 index 0000000000000..7c615ba167a3d --- /dev/null +++ b/offload/include/Shared/RefCnt.h @@ -0,0 +1,56 @@ +//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_REF_CNT_H +#define OMPTARGET_SHARED_REF_CNT_H + +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for thread-safe reference counting. Any class that needs +/// objects' reference counting can inherit from this entity or have it as a +/// class data member. +template +struct RefCountTy { + /// Create a refcount object initialized to zero. + RefCountTy() : Refs(0) {} + + ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } + + /// Increase the reference count atomically. + void increase() { Refs.fetch_add(1, MemoryOrder); } + + /// Decrease the reference count and return whether it became zero. Decreasing + /// the counter in more units than it was previously increased results in + /// undefined behavior. + bool decrease() { + Ty Prev = Refs.fetch_sub(1, MemoryOrder); + assert(Prev > 0 && "Invalid refcount"); + return (Prev == 1); + } + + Ty get() const { return Refs.load(MemoryOrder); } + +private: + /// The atomic reference counter. + std::atomic Refs; +}; +} // namespace target +} // namespace omp +} // namespace llvm + +#endif diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h new file mode 100644 index 0000000000000..15e3cfefa37ed --- /dev/null +++ b/offload/include/Shared/Types.h @@ -0,0 +1,22 @@ +//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Environments shared between host and device. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_TYPES_H +#define OMPTARGET_SHARED_TYPES_H + +#ifndef OMPTARGET_DEVICE_RUNTIME +#include +#else +#include "DeviceTypes.h" +#endif + +#endif // OMPTARGET_SHARED_TYPES_H diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h index fce14b54edb98..da83551fffd54 100644 --- a/offload/include/Shared/Utils.h +++ b/offload/include/Shared/Utils.h @@ -6,83 +6,73 @@ // //===----------------------------------------------------------------------===// // -// Routines and classes used to provide useful functionalities like string -// parsing and environment variables. +// Routines and classes used to provide useful functionalities for the host and +// the device. 
// //===----------------------------------------------------------------------===// #ifndef OMPTARGET_SHARED_UTILS_H #define OMPTARGET_SHARED_UTILS_H -#include "llvm/ADT/StringRef.h" +#include "Types.h" -#include "Debug.h" - -#include -#include -#include -#include - -namespace llvm { -namespace omp { -namespace target { - -/// Utility class for thread-safe reference counting. Any class that needs -/// objects' reference counting can inherit from this entity or have it as a -/// class data member. -template -struct RefCountTy { - /// Create a refcount object initialized to zero. - RefCountTy() : Refs(0) {} - - ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); } - - /// Increase the reference count atomically. - void increase() { Refs.fetch_add(1, MemoryOrder); } - - /// Decrease the reference count and return whether it became zero. Decreasing - /// the counter in more units than it was previously increased results in - /// undefined behavior. - bool decrease() { - Ty Prev = Refs.fetch_sub(1, MemoryOrder); - assert(Prev > 0 && "Invalid refcount"); - return (Prev == 1); - } - - Ty get() const { return Refs.load(MemoryOrder); } - -private: - /// The atomic reference counter. - std::atomic Refs; -}; +namespace utils { /// Return the difference (in bytes) between \p Begin and \p End. template -ptrdiff_t getPtrDiff(const void *End, const void *Begin) { +auto getPtrDiff(const void *End, const void *Begin) { return reinterpret_cast(End) - reinterpret_cast(Begin); } /// Return \p Ptr advanced by \p Offset bytes. -template Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) { - static_assert(std::is_void::value); - return const_cast(reinterpret_cast(Ptr) + Offset); +template Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) { + return (Ty1 *)(const_cast((const char *)(Ptr)) + Offset); } -/// Return \p Ptr aligned to \p Alignment bytes. -template Ty *alignPtr(Ty *Ptr, int64_t Alignment) { - size_t Space = std::numeric_limits::max(); - return std::align(Alignment, sizeof(char), Ptr, Space); +/// Return \p V aligned "upwards" according to \p Align. +template inline Ty1 alignPtr(Ty1 V, Ty2 Align) { + return reinterpret_cast(((uintptr_t(V) + Align - 1) / Align) * Align); +} +/// Return \p V aligned "downwards" according to \p Align. +template inline Ty1 alignDown(Ty1 V, Ty2 Align) { + return V - V % Align; } /// Round up \p V to a \p Boundary. template inline Ty roundUp(Ty V, Ty Boundary) { - return (V + Boundary - 1) / Boundary * Boundary; + return alignPtr(V, Boundary); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_ffs(V); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_ffsl(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_popcount(V); +} + +/// Return the number of bits set in \p V. 
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +template inline DstTy convertViaPun(SrcTy V) { + static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion"); + return *((DstTy *)(&V)); } -} // namespace target -} // namespace omp -} // namespace llvm +} // namespace utils #endif // OMPTARGET_SHARED_UTILS_H diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 86df4584db091..f0cc0c2e4d08e 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -26,6 +26,7 @@ #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" +#include "Shared/RefCnt.h" #include "Shared/Utils.h" #include "Utils/ELF.h" @@ -91,7 +92,7 @@ struct AMDGPUDeviceImageTy; struct AMDGPUMemoryManagerTy; struct AMDGPUMemoryPoolTy; -namespace utils { +namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. @@ -191,7 +192,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { std::string Target; - auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); @@ -212,7 +213,7 @@ Expected getTargetTripleAndFeatures(hsa_agent_t Agent) { return Err; return Target; } -} // namespace utils +} // namespace hsa_utils /// Utility class representing generic resource references to AMDGPU resources. template @@ -549,7 +550,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. - ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); + ImplicitArgsSize = + hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); // Get additional kernel info read from image @@ -1270,13 +1272,14 @@ struct AMDGPUStreamTy { // Issue the async memory copy. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Push an asynchronous memory copy device-to-host involving an unpinned @@ -1310,14 +1313,14 @@ struct AMDGPUStreamTy { // dependency if already satisfied. 
if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, &InputSignalRaw, OutputSignals[0]->get())) return Err; } else { - if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, - Src, Agent, CopySize, 0, nullptr, - OutputSignals[0]->get())) + if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, + Agent, Src, Agent, CopySize, 0, + nullptr, OutputSignals[0]->get())) return Err; } @@ -1408,12 +1411,13 @@ struct AMDGPUStreamTy { // dependency if already satisfied. if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, - CopySize, 0, nullptr, OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, + Agent, CopySize, 0, nullptr, + OutputSignal->get()); } // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead @@ -1437,13 +1441,13 @@ struct AMDGPUStreamTy { if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 1, &InputSignalRaw, + OutputSignal->get()); } - return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, - SrcAgent, CopySize, 0, nullptr, - OutputSignal->get()); + return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, + SrcAgent, CopySize, 0, nullptr, + OutputSignal->get()); } /// Synchronize with the stream. The current thread waits until all operations @@ -1806,7 +1810,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { Error retrieveAllMemoryPools() override { // Iterate through the available pools across the host agents. 
for (hsa_agent_t Agent : Agents) { - Error Err = utils::iterateAgentMemoryPools( + Error Err = hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = new AMDGPUMemoryPoolTy(HSAMemoryPool); @@ -1971,7 +1975,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Detect if XNACK is enabled auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); + hsa_utils::getTargetTripleAndFeatures(Agent); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); if (static_cast(*TargeTripleAndFeaturesOrError) @@ -2323,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, - Agent, PinnedPtr, Agent, Size, 0, - nullptr, Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, + Agent, PinnedPtr, Agent, Size, 0, + nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2383,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, - Agent, TgtPtr, Agent, Size, 0, nullptr, - Signal.get())) + if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), + PinnedPtr, Agent, TgtPtr, Agent, + Size, 0, nullptr, Signal.get())) return Err; if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) @@ -2427,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - if (auto Err = utils::asyncMemCopy( + if (auto Err = hsa_utils::asyncMemCopy( useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) return Err; @@ -2693,7 +2697,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Info.add("ISAs"); - auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) Info.add("Name", TmpChar); @@ -2775,7 +2779,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Retrieve and construct all memory pools of the device agent. Error retrieveAllMemoryPools() override { // Iterate through the available pools of the device agent. - return utils::iterateAgentMemoryPools( + return hsa_utils::iterateAgentMemoryPools( Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { AMDGPUMemoryPoolTy *MemoryPool = Plugin.allocate(); @@ -2961,7 +2965,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { if (Result) return Plugin::error("Loaded HSA executable does not validate"); - if (auto Err = utils::readAMDGPUMetaDataFromImage( + if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) return Err; @@ -3090,7 +3094,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { llvm::SmallVector HostAgents; // Count the number of available agents. - auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { + auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) { // Get the device type of the agent. 
hsa_device_type_t DeviceType; hsa_status_t Status = @@ -3185,7 +3189,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return false; auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); if (!TargeTripleAndFeaturesOrError) return TargeTripleAndFeaturesOrError.takeError(); return offloading::amdgpu::isImageCompatibleWithEnv( @@ -3333,11 +3337,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParams.Size)); + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3361,7 +3365,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. if (ImplArgs && - getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { + getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; ImplArgs->BlockCountY = 1; ImplArgs->BlockCountZ = 1; diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 0b6bc50ebf1d8..43be4e8edeba4 100644 --- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -23,7 +23,8 @@ namespace llvm { namespace omp { namespace target { namespace plugin { -namespace utils { +namespace hsa_utils { + // The implicit arguments of COV5 AMDGPU kernels. struct AMDGPUImplicitArgsTy { uint32_t BlockCountX; @@ -66,7 +67,7 @@ inline Error readAMDGPUMetaDataFromImage( return Err; } -} // namespace utils +} // namespace hsa_utils } // namespace plugin } // namespace target } // namespace omp diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 7e3e788fa52dc..41cc0f286a581 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -232,7 +232,7 @@ class DeviceImageTy { /// Get the image size. size_t getSize() const { - return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); } /// Get a memory buffer reference to the whole image. @@ -539,7 +539,7 @@ class PinnedAllocationMapTy { --It; // The buffer is not contained in the pinned allocation. - if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr) return &(*It); // None found. @@ -566,15 +566,15 @@ class PinnedAllocationMapTy { /// Indicate whether the first range A fully contains the second range B. static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrB >= PtrA && EndB <= EndA); } /// Indicate whether the first range A intersects with the second range B. 
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { - void *EndA = advanceVoidPtr(PtrA, SizeA); - void *EndB = advanceVoidPtr(PtrB, SizeB); + void *EndA = utils::advancePtr(PtrA, SizeA); + void *EndB = utils::advancePtr(PtrB, SizeB); return (PtrA < EndB && PtrB < EndA); } @@ -656,8 +656,8 @@ class PinnedAllocationMapTy { if (!Entry) return nullptr; - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } /// Check whether a buffer belongs to a registered host pinned allocation. @@ -944,7 +944,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); for (auto &It : *AllocationTraceMap) { if (It.first <= DevicePtr && - advanceVoidPtr(It.first, It.second->Size) > DevicePtr) + utils::advancePtr(It.first, It.second->Size) > DevicePtr) return It.second; } return nullptr; diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 59719027f122a..5acc94458bbd8 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -153,8 +153,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, HostGlobal.getPtr()); assert(Image.getStart() <= ImageGlobal.getPtr() && - advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < - advanceVoidPtr(Image.getStart(), Image.getSize()) && + utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) < + utils::advancePtr(Image.getStart(), Image.getSize()) && "Attempting to read outside the image!"); // Perform the copy from the image to the host memory. 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 9dbba1459839d..9adb62b677b92 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -51,7 +51,7 @@ namespace { bool isImageBitcode(const __tgt_device_image &Image) { StringRef Binary(reinterpret_cast(Image.ImageStart), - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); return identify_magic(Binary) == file_magic::bitcode; } @@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr &MB, Expected> createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { StringRef Data((const char *)Image.ImageStart, - target::getPtrDiff(Image.ImageEnd, Image.ImageStart)); + utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 60f7c918d7adb..25b815b7f9669 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -17,6 +17,7 @@ #include "ErrorReporting.h" #include "GlobalHandler.h" #include "JIT.h" +#include "Shared/Utils.h" #include "Utils/ELF.h" #include "omptarget.h" @@ -77,7 +78,7 @@ struct RecordReplayTy { Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); Device->free(Addr); // Align Address to MaxMemoryAllocation - Addr = (void *)alignPtr((Addr), MaxMemoryAllocation); + Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; } @@ -210,8 +211,8 @@ struct RecordReplayTy { if (EC) report_fatal_error("Error saving image : " + StringRef(EC.message())); if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = - getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); + size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, + TgtImageBitcode->ImageStart); MemoryBufferRef MBR = MemoryBufferRef( StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); OS << MBR.getBuffer(); @@ -244,10 +245,10 @@ struct RecordReplayTy { int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; memcpy(BufferPtr, OffloadEntry.Name, NameLength); - BufferPtr = advanceVoidPtr(BufferPtr, NameLength); + BufferPtr = utils::advancePtr(BufferPtr, NameLength); *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; - BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); + BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t)); auto Err = Plugin::success(); { @@ -257,11 +258,12 @@ struct RecordReplayTy { } if (Err) report_fatal_error("Error retrieving data for global"); - BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); + BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size); } assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && "Buffer over/under-filled."); - assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && + assert(Size == utils::getPtrDiff(BufferPtr, + GlobalsMB->get()->getBufferStart()) && "Buffer size mismatch"); StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); @@ -931,7 +933,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { size_t Bytes = - getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + 
utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, @@ -1159,8 +1161,8 @@ Expected PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, return std::move(Err); // Return the device accessible pointer with the correct offset. - return advanceVoidPtr(Entry->DevAccessiblePtr, - getPtrDiff(HstPtr, Entry->HstPtr)); + return utils::advancePtr(Entry->DevAccessiblePtr, + utils::getPtrDiff(HstPtr, Entry->HstPtr)); } // No intersecting registered allocation found in the map. First, lock the @@ -1697,7 +1699,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); @@ -1729,7 +1731,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, __tgt_device_image *Image) { StringRef Buffer(reinterpret_cast(Image->ImageStart), - target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); + utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..015c7775ba351 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -705,7 +705,7 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::error("Wrong device Page size"); // Ceil to page size. 
-    Size = roundUp(Size, Granularity);
+    Size = utils::roundUp(Size, Granularity);
 
     // Create a handler of our allocation
     CUmemGenericAllocationHandle AHandle;
diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp
index e42460b5cca4f..e5b4bf5526437 100644
--- a/offload/src/DeviceImage.cpp
+++ b/offload/src/DeviceImage.cpp
@@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc,
                              __tgt_device_image &TgtDeviceImage)
     : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) {
 
-  llvm::StringRef ImageStr(
-      static_cast(Image.ImageStart),
-      llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+  llvm::StringRef ImageStr(static_cast(Image.ImageStart),
+                           utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
 
   auto BinaryOrErr =
       llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, ""));
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 7a2ee1303d68c..66137b53b0cb4 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -156,8 +156,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) {
       for (auto &Image : PM->deviceImages()) {
         const char *Start = reinterpret_cast(
             Image.getExecutableImage().ImageStart);
-        uint64_t Length = llvm::omp::target::getPtrDiff(
-            Start, Image.getExecutableImage().ImageEnd);
+        uint64_t Length =
+            utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
         llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                      /*Identifier=*/"");
 

From 84bf0da34dd020090e05816fcbda305d1f422c27 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Thu, 5 Sep 2024 13:37:21 -0700
Subject: [PATCH 292/425] [Attributor][FIX] Ensure to always translate call
 site arguments (#107323)

When we propagate call site arguments, we always need to translate them;
this is important because we ended up picking the function argument for
a recursive call, not the call site argument. `@recBad` and `@recGood`
in `returned.ll` show the problem, as they used to be transformed the
same way. The restructuring cleans up the code and helps derive more
"returned" arguments and better information in the presence of
recursive calls.

The "dropped" attributes are simply dropped because we do not query
them anymore, not because we cannot derive them.
---
 .../Transforms/IPO/AttributorAttributes.cpp   | 103 +++++++---------
 .../Attributor/IPConstantProp/PR26044.ll      |  34 +++++---
 llvm/test/Transforms/Attributor/align.ll      |  34 +++++---
 .../Transforms/Attributor/memory_locations.ll |   2 +-
 llvm/test/Transforms/Attributor/range.ll      |   2 +-
 .../read_write_returned_arguments_scc.ll      |  28 ++---
 llvm/test/Transforms/Attributor/returned.ll   | 112 ++++++++++++++++--
 .../OpenMP/replace_globalization.ll           |   4 +-
 8 files changed, 197 insertions(+), 122 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 6b6d6d8d2a1e4..1fe8e6515fe0e 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -11516,9 +11516,21 @@ struct AAPotentialValuesReturned : public AAPotentialValuesFloating {
           return false;
         if (!AddValues)
           continue;
-        for (const AA::ValueAndContext &VAC : Values)
+
+        bool AllInterAreIntra = false;
+        if (S == AA::Interprocedural)
+          AllInterAreIntra =
+              llvm::all_of(Values, [&](const AA::ValueAndContext &VAC) {
+                return AA::isValidInScope(*VAC.getValue(), AnchorScope);
+              });
+
+        for (const AA::ValueAndContext &VAC : Values) {
          addValue(A, getState(), *VAC.getValue(),
-                   VAC.getCtxI() ? 
VAC.getCtxI() : CtxI, S, AnchorScope); + VAC.getCtxI() ? VAC.getCtxI() : CtxI, + AllInterAreIntra ? AA::AnyScope : S, AnchorScope); + } + if (AllInterAreIntra) + break; } return true; }; @@ -11547,16 +11559,6 @@ struct AAPotentialValuesReturned : public AAPotentialValuesFloating { : ChangeStatus::CHANGED; } - void addValue(Attributor &A, StateType &State, Value &V, - const Instruction *CtxI, AA::ValueScope S, - Function *AnchorScope) const override { - Function *F = getAssociatedFunction(); - if (auto *CB = dyn_cast(&V)) - if (CB->getCalledOperand() == F) - return; - Base::addValue(A, State, V, CtxI, S, AnchorScope); - } - ChangeStatus manifest(Attributor &A) override { if (ReturnedArg) return ChangeStatus::UNCHANGED; @@ -11651,64 +11653,39 @@ struct AAPotentialValuesCallSiteReturned : AAPotentialValuesImpl { UsedAssumedInformation)) return indicatePessimisticFixpoint(); - SmallVector Values; - if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this, - Values, AA::Intraprocedural, - UsedAssumedInformation)) - return indicatePessimisticFixpoint(); - Function *Caller = CB->getCaller(); - bool AnyNonLocal = false; - for (auto &It : Values) { - Value *V = It.getValue(); - std::optional CallerV = A.translateArgumentToCallSiteContent( - V, *CB, *this, UsedAssumedInformation); - if (!CallerV.has_value()) { - // Nothing to do as long as no value was determined. - continue; - } - V = *CallerV ? *CallerV : V; - if (AA::isDynamicallyUnique(A, *this, *V) && - AA::isValidInScope(*V, Caller)) { - if (*CallerV) { - SmallVector ArgValues; - IRPosition IRP = IRPosition::value(*V); - if (auto *Arg = dyn_cast(V)) - if (Arg->getParent() == CB->getCalledOperand()) - IRP = IRPosition::callsite_argument(*CB, Arg->getArgNo()); - if (recurseForValue(A, IRP, AA::AnyScope)) - continue; - } - addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope()); - } else { - AnyNonLocal = true; - break; - } - } - if (AnyNonLocal) { - Values.clear(); + auto AddScope = [&](AA::ValueScope S) { + SmallVector Values; if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this, - Values, AA::Interprocedural, - UsedAssumedInformation)) - return indicatePessimisticFixpoint(); - AnyNonLocal = false; - getState() = PotentialLLVMValuesState::getBestState(); + Values, S, UsedAssumedInformation)) + return false; + for (auto &It : Values) { Value *V = It.getValue(); - if (!AA::isDynamicallyUnique(A, *this, *V)) - return indicatePessimisticFixpoint(); - if (AA::isValidInScope(*V, Caller)) { - addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope()); - } else { - AnyNonLocal = true; - addValue(A, getState(), *V, CB, AA::Interprocedural, - getAnchorScope()); + std::optional CallerV = A.translateArgumentToCallSiteContent( + V, *CB, *this, UsedAssumedInformation); + if (!CallerV.has_value()) { + // Nothing to do as long as no value was determined. + continue; + } + V = *CallerV ? *CallerV : V; + if (*CallerV && AA::isDynamicallyUnique(A, *this, *V)) { + if (recurseForValue(A, IRPosition::value(*V), S)) + continue; + } + if (S == AA::Intraprocedural && !AA::isValidInScope(*V, Caller)) { + giveUpOnIntraprocedural(A); + return true; } + addValue(A, getState(), *V, CB, S, getAnchorScope()); } - if (AnyNonLocal) - giveUpOnIntraprocedural(A); - } + return true; + }; + if (!AddScope(AA::Intraprocedural)) + return indicatePessimisticFixpoint(); + if (!AddScope(AA::Interprocedural)) + return indicatePessimisticFixpoint(); return (AssumedBefore == getAssumed()) ? 
ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll index 35d0eeac50d51..f673ffc3bfac6 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll @@ -16,8 +16,7 @@ define void @fn2(ptr %P, i1 %C) { ; TUNIT: if.end: ; TUNIT-NEXT: [[E_2:%.*]] = phi ptr [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ] ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_2]], align 4 -; TUNIT-NEXT: [[CALL:%.*]] = call i32 @fn1(i32 [[TMP0]]) #[[ATTR3:[0-9]+]] -; TUNIT-NEXT: store i32 [[CALL]], ptr [[P]], align 4 +; TUNIT-NEXT: store i32 [[TMP0]], ptr [[P]], align 4 ; TUNIT-NEXT: br label [[FOR_COND1]] ; TUNIT: exit: ; TUNIT-NEXT: ret void @@ -55,11 +54,11 @@ exit: } define internal i32 @fn1(i32 %p1) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@fn1 -; CHECK-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[P1]] +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@fn1 +; CGSCC-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret i32 [[P1]] ; entry: %tobool = icmp ne i32 %p1, 0 @@ -71,7 +70,7 @@ define void @fn_no_null_opt(ptr %P, i1 %C) null_pointer_is_valid { ; ; TUNIT: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid ; TUNIT-LABEL: define {{[^@]+}}@fn_no_null_opt -; TUNIT-SAME: (ptr nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] { +; TUNIT-SAME: (ptr nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: br label [[IF_END:%.*]] ; TUNIT: for.cond1: @@ -79,8 +78,7 @@ define void @fn_no_null_opt(ptr %P, i1 %C) null_pointer_is_valid { ; TUNIT: if.end: ; TUNIT-NEXT: [[E_2:%.*]] = phi ptr [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ] ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr null, align 4294967296 -; TUNIT-NEXT: [[CALL:%.*]] = call i32 @fn0(i32 [[TMP0]]) #[[ATTR3]] -; TUNIT-NEXT: store i32 [[CALL]], ptr [[P]], align 4 +; TUNIT-NEXT: store i32 [[TMP0]], ptr [[P]], align 4 ; TUNIT-NEXT: br label [[FOR_COND1]] ; TUNIT: exit: ; TUNIT-NEXT: ret void @@ -118,11 +116,11 @@ exit: } define internal i32 @fn0(i32 %p1) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@fn0 -; CHECK-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[P1]] +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@fn0 +; CGSCC-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: ret i32 [[P1]] ; entry: %tobool = icmp ne i32 %p1, 0 @@ -131,12 +129,12 @@ entry: } ;. ; TUNIT: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind null_pointer_is_valid } -; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind memory(none) } +; TUNIT: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind null_pointer_is_valid } ;. 
; CGSCC: attributes #[[ATTR0]] = { nofree nosync nounwind memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR2]] = { nofree nosync nounwind null_pointer_is_valid } ; CGSCC: attributes #[[ATTR3]] = { nofree nosync } ;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll index 9880e53fd43a5..0a0cd415a2ab7 100644 --- a/llvm/test/Transforms/Attributor/align.ll +++ b/llvm/test/Transforms/Attributor/align.ll @@ -416,14 +416,14 @@ define void @test9_traversal(i1 %cnd, ptr align 4 %B, ptr align 8 %C) { ; FIXME: This will work with an upcoming patch (D66618 or similar) ; store i32 -1, ptr %g1, align 32 define ptr @test10a(ptr align 32 %p) { -; TUNIT: Function Attrs: nofree nosync nounwind +; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@test10a ; TUNIT-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR3:[0-9]+]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR3]] +; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR13:[0-9]+]] ; TUNIT-NEXT: store i32 1, ptr [[R]], align 32 ; TUNIT-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; TUNIT-NEXT: br label [[E:%.*]] @@ -435,14 +435,14 @@ define ptr @test10a(ptr align 32 %p) { ; TUNIT-NEXT: [[PHI:%.*]] = phi ptr [ [[G0]], [[T]] ], [ [[G1]], [[F]] ] ; TUNIT-NEXT: ret ptr [[PHI]] ; -; CGSCC: Function Attrs: nofree nosync nounwind +; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@test10a ; CGSCC-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4:[0-9]+]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR4]] +; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10a(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR16:[0-9]+]] ; CGSCC-NEXT: store i32 1, ptr [[R]], align 32 ; CGSCC-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; CGSCC-NEXT: br label [[E:%.*]] @@ -478,14 +478,14 @@ e: ; FIXME: This will work with an upcoming patch (D66618 or similar) ; store i32 -1, ptr %g1, align 32 define ptr @test10b(ptr align 32 %p) { -; TUNIT: Function Attrs: nofree nosync nounwind +; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@test10b ; TUNIT-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; TUNIT-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; 
TUNIT: t: -; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR3]] +; TUNIT-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR13]] ; TUNIT-NEXT: store i32 1, ptr [[R]], align 32 ; TUNIT-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; TUNIT-NEXT: br label [[E:%.*]] @@ -497,14 +497,14 @@ define ptr @test10b(ptr align 32 %p) { ; TUNIT-NEXT: [[PHI:%.*]] = phi ptr [ [[G0]], [[T]] ], [ [[G1]], [[F]] ] ; TUNIT-NEXT: ret ptr [[PHI]] ; -; CGSCC: Function Attrs: nofree nosync nounwind +; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@test10b ; CGSCC-SAME: (ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR4]] { ; CGSCC-NEXT: [[L:%.*]] = load i32, ptr [[P]], align 32 ; CGSCC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR4]] +; CGSCC-NEXT: [[R:%.*]] = call align 32 ptr @test10b(ptr nofree noundef nonnull align 32 dereferenceable(4) "no-capture-maybe-returned" [[P]]) #[[ATTR16]] ; CGSCC-NEXT: store i32 1, ptr [[R]], align 32 ; CGSCC-NEXT: [[G0:%.*]] = getelementptr i32, ptr [[P]], i32 8 ; CGSCC-NEXT: br label [[E:%.*]] @@ -946,7 +946,7 @@ define i32 @musttail_caller_1(ptr %p) { ; TUNIT-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]] ; TUNIT: mt: -; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR13:[0-9]+]] +; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR14:[0-9]+]] ; TUNIT-NEXT: ret i32 [[V]] ; TUNIT: exit: ; TUNIT-NEXT: ret i32 0 @@ -957,7 +957,7 @@ define i32 @musttail_caller_1(ptr %p) { ; CGSCC-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]] ; CGSCC: mt: -; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR16:[0-9]+]] +; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR17:[0-9]+]] ; CGSCC-NEXT: ret i32 [[V]] ; CGSCC: exit: ; CGSCC-NEXT: ret i32 0 @@ -1089,7 +1089,7 @@ define ptr @aligned_8_return_caller(ptr align(16) %a, i1 %c1, i1 %c2) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@aligned_8_return_caller ; TUNIT-SAME: (ptr nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR10]] { -; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR14:[0-9]+]] +; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR15:[0-9]+]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) @@ -1221,7 +1221,7 @@ attributes #2 = { null_pointer_is_valid } ; TUNIT: 
attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR2]] = { nounwind } -; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind } +; TUNIT: attributes #[[ATTR3]] = { nofree nosync nounwind memory(argmem: readwrite) } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) } ; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; TUNIT: attributes #[[ATTR6]] = { nounwind willreturn } @@ -1231,14 +1231,15 @@ attributes #2 = { null_pointer_is_valid } ; TUNIT: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; TUNIT: attributes #[[ATTR11]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } ; TUNIT: attributes #[[ATTR12]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR14]] = { nofree nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind } +; TUNIT: attributes #[[ATTR14]] = { nofree nosync nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR15]] = { nofree nosync nounwind willreturn } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR2]] = { noinline nounwind uwtable } ; CGSCC: attributes #[[ATTR3]] = { nounwind } -; CGSCC: attributes #[[ATTR4]] = { nofree nosync nounwind } +; CGSCC: attributes #[[ATTR4]] = { nofree nosync nounwind memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) } ; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CGSCC: attributes #[[ATTR7]] = { nounwind willreturn } @@ -1250,5 +1251,6 @@ attributes #2 = { null_pointer_is_valid } ; CGSCC: attributes #[[ATTR13]] = { mustprogress nofree nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR14]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } ; CGSCC: attributes #[[ATTR15]] = { nofree nosync willreturn } -; CGSCC: attributes #[[ATTR16]] = { nofree willreturn memory(read) } +; CGSCC: attributes #[[ATTR16]] = { nofree nosync nounwind } +; CGSCC: attributes #[[ATTR17]] = { nofree willreturn memory(read) } ;. 
diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll index 2dbdf9e6048c0..a7d3fba9cf9b8 100644 --- a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -35,7 +35,7 @@ define dso_local ptr @internal_only_rec(i32 %arg) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[ARG]], 2 -; CHECK-NEXT: [[CALL:%.*]] = call noalias ptr @internal_only_rec(i32 [[DIV]]) +; CHECK-NEXT: [[CALL:%.*]] = call ptr @internal_only_rec(i32 [[DIV]]) ; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ARG]] to i64 diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll index 9b2f9ed2dde91..48040fec772dc 100644 --- a/llvm/test/Transforms/Attributor/range.ll +++ b/llvm/test/Transforms/Attributor/range.ll @@ -48,7 +48,7 @@ define void @test0-icmp-check(ptr %p){ ; ret = [0, 10) ; TUNIT-LABEL: define {{[^@]+}}@test0-icmp-check ; TUNIT-SAME: (ptr nocapture nofree readonly align 4 [[P:%.*]]) { -; TUNIT-NEXT: [[RET:%.*]] = tail call i32 @test0(ptr nocapture nofree noundef readonly align 4 [[P]]) #[[ATTR3]], !range [[RNG0]] +; TUNIT-NEXT: [[RET:%.*]] = tail call i32 @test0(ptr nocapture nofree noundef readonly align 4 [[P]]) #[[ATTR3]] ; TUNIT-NEXT: [[CMP_EQ_1:%.*]] = icmp eq i32 [[RET]], 10 ; TUNIT-NEXT: [[CMP_EQ_2:%.*]] = icmp eq i32 [[RET]], 9 ; TUNIT-NEXT: [[CMP_EQ_3:%.*]] = icmp eq i32 [[RET]], 8 diff --git a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll index 81530e9164899..775c949ca8939 100644 --- a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll +++ b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll @@ -64,7 +64,7 @@ entry: define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@internal_ret0_nw -; TUNIT-SAME: (ptr nofree [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; TUNIT-SAME: (ptr nofree returned [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[R0:%.*]] = alloca i32, align 4 ; TUNIT-NEXT: [[R1:%.*]] = alloca i32, align 4 @@ -84,12 +84,12 @@ define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; TUNIT-NEXT: [[CALL5:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull [[N0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL5]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] -; TUNIT-NEXT: ret ptr [[RETVAL_0]] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[N0]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] +; TUNIT-NEXT: ret ptr [[N0]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@internal_ret0_nw -; CGSCC-SAME: (ptr nofree [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; CGSCC-SAME: (ptr nofree returned [[N0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[R0:%.*]] = alloca i32, align 4 ; CGSCC-NEXT: [[R1:%.*]] = alloca i32, align 4 @@ -109,8 +109,8 @@ define internal ptr @internal_ret0_nw(ptr %n0, ptr %w0) { ; CGSCC-NEXT: [[CALL5:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull [[N0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) 
#[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL5]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] -; CGSCC-NEXT: ret ptr [[RETVAL_0]] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[N0]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] +; CGSCC-NEXT: ret ptr [[N0]] ; entry: %r0 = alloca i32, align 4 @@ -164,7 +164,7 @@ define internal ptr @internal_ret1_rrw(ptr %r0, ptr %r1, ptr %w0) { ; TUNIT-NEXT: [[CALL8:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull align 4 dereferenceable(4) [[R1]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL8]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[R1]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] ; TUNIT-NEXT: ret ptr undef ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) @@ -192,7 +192,7 @@ define internal ptr @internal_ret1_rrw(ptr %r0, ptr %r1, ptr %w0) { ; CGSCC-NEXT: [[CALL8:%.*]] = call ptr @internal_ret0_nw(ptr nofree nonnull align 4 dereferenceable(4) [[R1]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL8]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[R1]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] ; CGSCC-NEXT: ret ptr undef ; entry: @@ -259,7 +259,7 @@ return: ; preds = %if.end, %if.then define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@internal_ret1_rw -; TUNIT-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; TUNIT-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree returned [[W0:%.*]]) #[[ATTR0]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[R0]], align 4 ; TUNIT-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -276,12 +276,12 @@ define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; TUNIT-NEXT: [[CALL4:%.*]] = call ptr @external_ret2_nrw(ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; TUNIT-NEXT: br label [[RETURN]] ; TUNIT: return: -; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL4]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] -; TUNIT-NEXT: ret ptr [[RETVAL_0]] +; TUNIT-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[W0]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] +; TUNIT-NEXT: ret ptr [[W0]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@internal_ret1_rw -; CGSCC-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree [[W0:%.*]]) #[[ATTR0]] { +; CGSCC-SAME: (ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], ptr nofree returned [[W0:%.*]]) #[[ATTR0]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[R0]], align 4 ; CGSCC-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -298,8 +298,8 @@ define internal ptr @internal_ret1_rw(ptr %r0, ptr %w0) { ; CGSCC-NEXT: [[CALL4:%.*]] = call ptr @external_ret2_nrw(ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree noundef nonnull align 4 dereferenceable(4) [[R0]], ptr nofree nonnull align 4 dereferenceable(4) [[W0]]) 
#[[ATTR2]] ; CGSCC-NEXT: br label [[RETURN]] ; CGSCC: return: -; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[CALL4]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] -; CGSCC-NEXT: ret ptr [[RETVAL_0]] +; CGSCC-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[W0]], [[IF_END]] ], [ [[W0]], [[IF_THEN]] ] +; CGSCC-NEXT: ret ptr [[W0]] ; entry: %0 = load i32, ptr %r0, align 4 diff --git a/llvm/test/Transforms/Attributor/returned.ll b/llvm/test/Transforms/Attributor/returned.ll index e94cb95069694..bc55a50f0e6f7 100644 --- a/llvm/test/Transforms/Attributor/returned.ll +++ b/llvm/test/Transforms/Attributor/returned.ll @@ -483,7 +483,7 @@ define ptr @rt0(ptr %a) #0 { ; TUNIT-LABEL: define {{[^@]+}}@rt0 ; TUNIT-SAME: (ptr nofree noundef nonnull readonly returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[A:%.*]]) #[[ATTR3:[0-9]+]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @rt0(ptr nofree noundef nonnull readonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR9:[0-9]+]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @rt0(ptr nofree noundef nonnull readonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[A]]) #[[ATTR10:[0-9]+]] ; TUNIT-NEXT: ret ptr [[A]] ; ; CGSCC: Function Attrs: nofree noinline nosync nounwind memory(argmem: read) uwtable @@ -667,7 +667,7 @@ define ptr @calls_unknown_fn(ptr %r) #0 { ; TUNIT: Function Attrs: noinline nounwind uwtable ; TUNIT-LABEL: define {{[^@]+}}@calls_unknown_fn ; TUNIT-SAME: (ptr nofree readnone returned "no-capture-maybe-returned" [[R:%.*]]) #[[ATTR5:[0-9]+]] { -; TUNIT-NEXT: tail call void @unknown_fn(ptr noundef nonnull @calls_unknown_fn) #[[ATTR10:[0-9]+]] +; TUNIT-NEXT: tail call void @unknown_fn(ptr noundef nonnull @calls_unknown_fn) #[[ATTR11:[0-9]+]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -716,7 +716,7 @@ define ptr @calls_maybe_redefined_fn(ptr %r) #0 { ; TUNIT-LABEL: define {{[^@]+}}@calls_maybe_redefined_fn ; TUNIT-SAME: (ptr returned [[R:%.*]]) #[[ATTR5]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn(ptr [[R]]) #[[ATTR10]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn(ptr [[R]]) #[[ATTR11]] ; TUNIT-NEXT: ret ptr [[R]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -765,7 +765,7 @@ define ptr @calls_maybe_redefined_fn2(ptr %r) #0 { ; TUNIT-LABEL: define {{[^@]+}}@calls_maybe_redefined_fn2 ; TUNIT-SAME: (ptr [[R:%.*]]) #[[ATTR5]] { ; TUNIT-NEXT: entry: -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn2(ptr [[R]]) #[[ATTR10]] +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @maybe_redefined_fn2(ptr [[R]]) #[[ATTR11]] ; TUNIT-NEXT: ret ptr [[CALL]] ; ; CGSCC: Function Attrs: noinline nounwind uwtable @@ -1451,6 +1451,103 @@ declare dso_local ptr @__dynamic_cast(ptr, ptr, ptr, i64) ; UTC_ARGS: --enable +; This does not return %arg, @recGood does. 
+ +define internal i32 @recBad(i1 %c, i32 %arg) { +; TUNIT: Function Attrs: nofree nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recBad +; TUNIT-SAME: (i1 noundef [[C:%.*]], i32 [[ARG:%.*]]) #[[ATTR8]] { +; TUNIT-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 +; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; TUNIT: t: +; TUNIT-NEXT: [[R:%.*]] = call i32 @recBad(i1 noundef false, i32 [[ADD]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ADD]] +; TUNIT: f: +; TUNIT-NEXT: ret i32 [[ARG]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recBad +; CGSCC-SAME: (i1 noundef [[C:%.*]], i32 [[ARG:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 +; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; CGSCC: t: +; CGSCC-NEXT: [[R:%.*]] = call i32 @recBad(i1 noundef false, i32 [[ADD]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[ADD]] +; CGSCC: f: +; CGSCC-NEXT: ret i32 [[ARG]] +; + %add = add i32 %arg, 1 + br i1 %c, label %t, label %f +t: + %r = call i32 @recBad(i1 false, i32 %add) + ret i32 %r +f: + ret i32 %arg +} + +define i32 @recBadCaller(i1 %cQ, i32 %argQ) { +; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recBadCaller +; TUNIT-SAME: (i1 [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR9:[0-9]+]] { +; TUNIT-NEXT: [[RQ:%.*]] = call i32 @recBad(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[RQ]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recBadCaller +; CGSCC-SAME: (i1 noundef [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[RQ:%.*]] = call i32 @recBad(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[RQ]] +; + %rQ = call i32 @recBad(i1 %cQ, i32 %argQ) + ret i32 %rQ +} + +define internal i32 @recGood(i1 %c, i32 %arg) { +; TUNIT: Function Attrs: nofree nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recGood +; TUNIT-SAME: (i1 noundef [[C:%.*]], i32 returned [[ARG:%.*]]) #[[ATTR8]] { +; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; TUNIT: t: +; TUNIT-NEXT: [[R:%.*]] = call i32 @recGood(i1 noundef false, i32 [[ARG]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ARG]] +; TUNIT: f: +; TUNIT-NEXT: ret i32 [[ARG]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recGood +; CGSCC-SAME: (i1 noundef [[C:%.*]], i32 returned [[ARG:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] +; CGSCC: t: +; CGSCC-NEXT: [[R:%.*]] = call i32 @recGood(i1 noundef false, i32 [[ARG]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[ARG]] +; CGSCC: f: +; CGSCC-NEXT: ret i32 [[ARG]] +; + br i1 %c, label %t, label %f +t: + %r = call i32 @recGood(i1 false, i32 %arg) + ret i32 %r +f: + ret i32 %arg +} + +define i32 @recGoodCaller(i1 %cQ, i32 %argQ) { +; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) +; TUNIT-LABEL: define {{[^@]+}}@recGoodCaller +; TUNIT-SAME: (i1 [[CQ:%.*]], i32 returned [[ARGQ:%.*]]) #[[ATTR9]] { +; TUNIT-NEXT: [[RQ:%.*]] = call i32 @recGood(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR8]] +; TUNIT-NEXT: ret i32 [[ARGQ]] +; +; CGSCC: Function Attrs: nofree nosync nounwind memory(none) +; CGSCC-LABEL: define {{[^@]+}}@recGoodCaller +; CGSCC-SAME: (i1 noundef [[CQ:%.*]], i32 [[ARGQ:%.*]]) #[[ATTR7]] { +; CGSCC-NEXT: [[RQ:%.*]] = call i32 @recGood(i1 noundef [[CQ]], i32 [[ARGQ]]) #[[ATTR7]] +; CGSCC-NEXT: ret i32 [[RQ]] +; + %rQ = call i32 @recGood(i1 %cQ, i32 %argQ) + ret i32 %rQ +} + 
attributes #0 = { noinline nounwind uwtable } ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } @@ -1462,9 +1559,10 @@ attributes #0 = { noinline nounwind uwtable } ; TUNIT: attributes #[[ATTR6]] = { noreturn } ; TUNIT: attributes #[[ATTR7:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; TUNIT: attributes #[[ATTR8]] = { nofree nosync nounwind memory(none) } -; TUNIT: attributes #[[ATTR9]] = { nofree nosync nounwind memory(read) } -; TUNIT: attributes #[[ATTR10]] = { nounwind } -; TUNIT: attributes #[[ATTR11:[0-9]+]] = { nounwind memory(none) } +; TUNIT: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind memory(none) } +; TUNIT: attributes #[[ATTR10]] = { nofree nosync nounwind memory(read) } +; TUNIT: attributes #[[ATTR11]] = { nounwind } +; TUNIT: attributes #[[ATTR12:[0-9]+]] = { nounwind memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR1]] = { nofree noinline nosync nounwind memory(none) uwtable } diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll index 0f89b428de7bf..6e4fb9e57388b 100644 --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -201,7 +201,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-NEXT: ret void ; ; -; CHECK: Function Attrs: norecurse nosync nounwind allocsize(0) memory(read) +; CHECK: Function Attrs: nosync nounwind allocsize(0) memory(read) ; CHECK-LABEL: define {{[^@]+}}@__kmpc_alloc_shared ; CHECK-SAME: (i64 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr @offset, align 4 @@ -216,7 +216,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) } -; CHECK: attributes #[[ATTR2]] = { norecurse nosync nounwind allocsize(0) memory(read) } +; CHECK: attributes #[[ATTR2]] = { nosync nounwind allocsize(0) memory(read) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR5]] = { "llvm.assume"="omp_no_openmp" } From 52dca6ffae08fcd86cff32ab469870016a6aceb5 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 13:38:39 -0700 Subject: [PATCH 293/425] [SandboxVec] Boilerplate (#107431) This patch implements the new pass and registers it with the pass manager. For context, this is a vectorizer that operates on Sandbox IR, which is a transactional IR on top of LLVM IR. 
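For reference, once registered in PassRegistry.def the boilerplate pass can be exercised directly, e.g. with `opt -passes=sandbox-vectorizer -S input.ll` (where `input.ll` is just a placeholder name); the new boilerplate.ll test below does exactly this and only verifies that the pass runs and leaves the function unchanged.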
--- .../SandboxVectorizer/SandboxVectorizer.h | 31 +++++++++++++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 3 ++ .../SandboxVectorizer/SandboxVectorizer.cpp | 39 +++++++++++++++++++ .../SandboxVectorizer/boilerplate.ll | 11 ++++++ 6 files changed, 86 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp create mode 100644 llvm/test/Transforms/SandboxVectorizer/boilerplate.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h new file mode 100644 index 0000000000000..0d4dbae44521e --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -0,0 +1,31 @@ +//===- SandboxVectorizer.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/PassManager.h" +#include "llvm/SandboxIR/SandboxIR.h" + +namespace llvm { + +class SandboxVectorizerPass : public PassInfoMixin { + TargetTransformInfo *TTI = nullptr; + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + bool runImpl(Function &F); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index a22abed8051a1..7c0acc0745eba 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -321,6 +321,7 @@ #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" #include diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index d6067089c6b5c..a1324e8170566 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -453,6 +453,7 @@ FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("reg2mem", RegToMemPass()) FUNCTION_PASS("safe-stack", SafeStackPass(TM)) +FUNCTION_PASS("sandbox-vectorizer", SandboxVectorizerPass()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) FUNCTION_PASS("scalarizer", ScalarizerPass()) FUNCTION_PASS("sccp", SCCPPass()) diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 5c88d94d96622..649faad48d71e 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize LoopIdiomVectorize.cpp 
LoopVectorizationLegality.cpp LoopVectorize.cpp + SandboxVectorizer/SandboxVectorizer.cpp SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp @@ -18,6 +19,7 @@ add_llvm_component_library(LLVMVectorize ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Vectorize + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Vectorize/SandboxVectorizer DEPENDS intrinsics_gen @@ -27,4 +29,5 @@ add_llvm_component_library(LLVMVectorize Core Support TransformUtils + SandboxIR ) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp new file mode 100644 index 0000000000000..072a6606694a0 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -0,0 +1,39 @@ +//===- SandboxVectorizer.cpp - Vectorizer based on Sandbox IR -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define SV_NAME "sandbox-vectorizer" +#define DEBUG_TYPE SV_NAME + +PreservedAnalyses SandboxVectorizerPass::run(Function &F, + FunctionAnalysisManager &AM) { + TTI = &AM.getResult(F); + + bool Changed = runImpl(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +bool SandboxVectorizerPass::runImpl(Function &F) { + LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n"); + sandboxir::Context Ctx(F.getContext()); + // Create SandboxIR for `F`. + sandboxir::Function &SBF = *Ctx.createFunction(&F); + // TODO: Initialize SBVec Pass Manager + (void)SBF; + + return false; +} diff --git a/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll b/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll new file mode 100644 index 0000000000000..353659d41485f --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/boilerplate.ll @@ -0,0 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer %s -S | FileCheck %s + +; This test checks that the pass was registered with the pass manager. +; TODO: Remove this test once actual tests land. +define void @boilerplate() { +; CHECK-LABEL: define void @boilerplate() { +; CHECK-NEXT: ret void +; + ret void +} From 4634a480e0e5aa3116b397369fe3877a8dfe4dc0 Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Thu, 5 Sep 2024 13:43:34 -0700 Subject: [PATCH 294/425] [scudo] Add a method to use a hard-coded page size (#106646) Currently, only Android supports using a hard-code page size. Make this a bit more generic so any platform that wants to can use this. In addition, add a getPageSizeLogCached() function since this value is used in release.h and can avoid keeping this value around in objects. Finally, change some of the release.h page size multiplies to shifts using the new page size log value. 
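To make the multiply-to-shift rewrite concrete: the page size is a power of two, so PageSize == 1 << PageSizeLog and divisions or multiplications by the page size reduce to shifts by the cached log value. For example, the fragmentation accounting in release.h changes from

  ReleasedPagesCount += (To - From) / getPageSizeCached();

to

  ReleasedPagesCount += (To - From) >> getPageSizeLogCached();

and offsets such as ReleasePageOffset * PageSize become ReleasePageOffset << getPageSizeLogCached().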
--- compiler-rt/lib/scudo/standalone/combined.h | 3 +++ compiler-rt/lib/scudo/standalone/common.cpp | 10 ++++++- compiler-rt/lib/scudo/standalone/common.h | 30 ++++++++++++++++++--- compiler-rt/lib/scudo/standalone/linux.cpp | 3 +++ compiler-rt/lib/scudo/standalone/platform.h | 5 ++++ compiler-rt/lib/scudo/standalone/release.h | 30 ++++++++++++--------- 6 files changed, 63 insertions(+), 18 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 9c26282e6f860..a5f1bc388e882 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -140,6 +140,9 @@ class Allocator { typedef typename QuarantineT::CacheT QuarantineCacheT; void init() { + // Make sure that the page size is initialized if it's not a constant. + CHECK_NE(getPageSizeCached(), 0U); + performSanityChecks(); // Check if hardware CRC32 is supported in the binary and by the platform, diff --git a/compiler-rt/lib/scudo/standalone/common.cpp b/compiler-rt/lib/scudo/standalone/common.cpp index 06e930638f6f9..80134c39e757d 100644 --- a/compiler-rt/lib/scudo/standalone/common.cpp +++ b/compiler-rt/lib/scudo/standalone/common.cpp @@ -12,13 +12,21 @@ namespace scudo { -uptr PageSizeCached; +#if !defined(SCUDO_PAGE_SIZE) +uptr PageSizeCached = 0; +uptr PageSizeLogCached = 0; + +// Must be defined in platform specific code. uptr getPageSize(); +// This must be called in the init path or there could be a race if multiple +// threads try to set the cached values. uptr getPageSizeSlow() { PageSizeCached = getPageSize(); CHECK_NE(PageSizeCached, 0); + PageSizeLogCached = getLog2(PageSizeCached); return PageSizeCached; } +#endif } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index 151fbd317e74d..e5dfda2e9072a 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -133,18 +133,40 @@ inline void computePercentage(uptr Numerator, uptr Denominator, uptr *Integral, // Platform specific functions. +#if defined(SCUDO_PAGE_SIZE) + +inline constexpr uptr getPageSizeCached() { return SCUDO_PAGE_SIZE; } + +inline constexpr uptr getPageSizeSlow() { return getPageSizeCached(); } + +inline constexpr uptr getPageSizeLogCached() { + return static_cast(__builtin_ctzl(SCUDO_PAGE_SIZE)); +} + +#else + extern uptr PageSizeCached; +extern uptr PageSizeLogCached; + uptr getPageSizeSlow(); + inline uptr getPageSizeCached() { -#if SCUDO_ANDROID && defined(PAGE_SIZE) - // Most Android builds have a build-time constant page size. - return PAGE_SIZE; -#endif if (LIKELY(PageSizeCached)) return PageSizeCached; return getPageSizeSlow(); } +inline uptr getPageSizeLogCached() { + if (LIKELY(PageSizeLogCached)) + return PageSizeLogCached; + // PageSizeLogCached and PageSizeCached are both set in getPageSizeSlow() + getPageSizeSlow(); + DCHECK_NE(PageSizeLogCached, 0); + return PageSizeLogCached; +} + +#endif + // Returns 0 if the number of CPUs could not be determined. u32 getNumberOfCPUs(); diff --git a/compiler-rt/lib/scudo/standalone/linux.cpp b/compiler-rt/lib/scudo/standalone/linux.cpp index 2746951081098..6cc8e0c786e06 100644 --- a/compiler-rt/lib/scudo/standalone/linux.cpp +++ b/compiler-rt/lib/scudo/standalone/linux.cpp @@ -40,7 +40,10 @@ namespace scudo { +#if !defined(SCUDO_PAGE_SIZE) +// This function is only used when page size is not hard-coded. 
uptr getPageSize() { return static_cast(sysconf(_SC_PAGESIZE)); } +#endif void NORETURN die() { abort(); } diff --git a/compiler-rt/lib/scudo/standalone/platform.h b/compiler-rt/lib/scudo/standalone/platform.h index 5af1275e32d2b..3f017faaee78f 100644 --- a/compiler-rt/lib/scudo/standalone/platform.h +++ b/compiler-rt/lib/scudo/standalone/platform.h @@ -21,6 +21,11 @@ // See https://android.googlesource.com/platform/bionic/+/master/docs/defines.md #if defined(__BIONIC__) #define SCUDO_ANDROID 1 +// Transitive includes of unistd.h will get PAGE_SIZE if it is defined. +#include +#if defined(PAGE_SIZE) +#define SCUDO_PAGE_SIZE PAGE_SIZE +#endif #else #define SCUDO_ANDROID 0 #endif diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index 69f926e3f8680..51abdd82aa538 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -88,7 +88,7 @@ class FragmentationRecorder { void releasePageRangeToOS(uptr From, uptr To) { DCHECK_EQ((To - From) % getPageSizeCached(), 0U); - ReleasedPagesCount += (To - From) / getPageSizeCached(); + ReleasedPagesCount += (To - From) >> getPageSizeLogCached(); } private: @@ -348,7 +348,7 @@ class RegionPageMap { template class FreePagesRangeTracker { public: explicit FreePagesRangeTracker(ReleaseRecorderT &Recorder) - : Recorder(Recorder), PageSizeLog(getLog2(getPageSizeCached())) {} + : Recorder(Recorder) {} void processNextPage(bool Released) { if (Released) { @@ -372,6 +372,7 @@ template class FreePagesRangeTracker { private: void closeOpenedRange() { if (InRange) { + const uptr PageSizeLog = getPageSizeLogCached(); Recorder.releasePageRangeToOS((CurrentRangeStatePage << PageSizeLog), (CurrentPage << PageSizeLog)); InRange = false; @@ -379,7 +380,6 @@ template class FreePagesRangeTracker { } ReleaseRecorderT &Recorder; - const uptr PageSizeLog; bool InRange = false; uptr CurrentPage = 0; uptr CurrentRangeStatePage = 0; @@ -389,7 +389,7 @@ struct PageReleaseContext { PageReleaseContext(uptr BlockSize, uptr NumberOfRegions, uptr ReleaseSize, uptr ReleaseOffset = 0) : BlockSize(BlockSize), NumberOfRegions(NumberOfRegions) { - PageSize = getPageSizeCached(); + const uptr PageSize = getPageSizeCached(); if (BlockSize <= PageSize) { if (PageSize % BlockSize == 0) { // Same number of chunks per page, no cross overs. @@ -408,7 +408,7 @@ struct PageReleaseContext { SameBlockCountPerPage = false; } } else { - if (BlockSize % PageSize == 0) { + if ((BlockSize & (PageSize - 1)) == 0) { // One chunk covers multiple pages, no cross overs. FullPagesBlockCountMax = 1; SameBlockCountPerPage = true; @@ -427,8 +427,8 @@ struct PageReleaseContext { if (NumberOfRegions != 1) DCHECK_EQ(ReleaseOffset, 0U); - PagesCount = roundUp(ReleaseSize, PageSize) / PageSize; - PageSizeLog = getLog2(PageSize); + const uptr PageSizeLog = getPageSizeLogCached(); + PagesCount = roundUp(ReleaseSize, PageSize) >> PageSizeLog; ReleasePageOffset = ReleaseOffset >> PageSizeLog; } @@ -451,6 +451,7 @@ struct PageReleaseContext { // RegionSize, it's not necessary to be aligned with page size. 
bool markRangeAsAllCounted(uptr From, uptr To, uptr Base, const uptr RegionIndex, const uptr RegionSize) { + const uptr PageSize = getPageSizeCached(); DCHECK_LT(From, To); DCHECK_LE(To, Base + RegionSize); DCHECK_EQ(From % PageSize, 0U); @@ -544,6 +545,7 @@ struct PageReleaseContext { if (!ensurePageMapAllocated()) return false; + const uptr PageSize = getPageSizeCached(); if (MayContainLastBlockInRegion) { const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; @@ -605,17 +607,19 @@ struct PageReleaseContext { return true; } - uptr getPageIndex(uptr P) { return (P >> PageSizeLog) - ReleasePageOffset; } - uptr getReleaseOffset() { return ReleasePageOffset << PageSizeLog; } + uptr getPageIndex(uptr P) { + return (P >> getPageSizeLogCached()) - ReleasePageOffset; + } + uptr getReleaseOffset() { + return ReleasePageOffset << getPageSizeLogCached(); + } uptr BlockSize; uptr NumberOfRegions; // For partial region marking, some pages in front are not needed to be // counted. uptr ReleasePageOffset; - uptr PageSize; uptr PagesCount; - uptr PageSizeLog; uptr FullPagesBlockCountMax; bool SameBlockCountPerPage; RegionPageMap PageMap; @@ -628,7 +632,7 @@ template NOINLINE void releaseFreeMemoryToOS(PageReleaseContext &Context, ReleaseRecorderT &Recorder, SkipRegionT SkipRegion) { - const uptr PageSize = Context.PageSize; + const uptr PageSize = getPageSizeCached(); const uptr BlockSize = Context.BlockSize; const uptr PagesCount = Context.PagesCount; const uptr NumberOfRegions = Context.NumberOfRegions; @@ -671,7 +675,7 @@ releaseFreeMemoryToOS(PageReleaseContext &Context, uptr PrevPageBoundary = 0; uptr CurrentBoundary = 0; if (ReleasePageOffset > 0) { - PrevPageBoundary = ReleasePageOffset * PageSize; + PrevPageBoundary = ReleasePageOffset << getPageSizeLogCached(); CurrentBoundary = roundUpSlow(PrevPageBoundary, BlockSize); } for (uptr J = 0; J < PagesCount; J++) { From 247d3ea843cb20d8d75ec781cd603c8ececf8934 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 5 Sep 2024 13:43:31 -0700 Subject: [PATCH 295/425] [SLP] Expand non-power-of-two bailout in TryToFindDuplicates This fixes a crash noticed when doing a downstream merge. The test case has been reduced, and is included in this commit. The existing bailout for non-power-of-two vectors in TryToFindDuplicates did not consider the case where the list being vectorized had no root node. This allowed reshuffled scalars to slip through to code which does not yet expect to handle it. This was an existing bug (likely introduced by my ed03070e), but made easier to hit by 63e8a1b1 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 +- .../SLPVectorizer/RISCV/vec3-base.ll | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 74bb529b2526e..a77d236413a96 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6989,7 +6989,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 
- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) || + !llvm::has_single_bit(VL.size())) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 4e8e019e155db..faffe16f8e9cd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -762,6 +762,36 @@ define double @dot_product_fp64(ptr %a, ptr %b) { ret double %add.1 } +;; Covers a case where SLP would previous crash due to a +;; missing bailout in TryToFindDuplicates for the case +;; where a VL=3 list was vectorized directly (without +;; a root instruction such as a store or reduce). +define double @no_root_reshuffle(ptr %ptr) { +; CHECK-LABEL: @no_root_reshuffle( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]] +; CHECK-NEXT: ret double [[ADD]] +; +entry: + %0 = load double, ptr %ptr, align 8 + %mul = fmul fast double %0, %0 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8 + %1 = load double, ptr %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16 + %2 = load double, ptr %arrayidx3, align 8 + %3 = fmul fast double %2, %2 + %mul6 = fmul fast double %3, %1 + %add = fadd fast double %mul6, %mul + ret double %add +} declare float @llvm.fmuladd.f32(float, float, float) From 24684bb4a9791145a36a97477eb1fd525a122d8e Mon Sep 17 00:00:00 2001 From: Evgenii Stepanov Date: Thu, 5 Sep 2024 14:09:33 -0700 Subject: [PATCH 296/425] [sanitizer] Delay sanitizer args parsing (#107280) Delay sanitizer arg parsing until after -Xclang flags are forwarded to the clang command line. 
This allows the check in hasTargetFeatureMTE to pick up manually specified target feature, and enables the following: -march=armv8-a -Xclang -target-feature -Xclang +mte -fsanitize=memtag-stack --- clang/lib/Driver/ToolChains/Clang.cpp | 6 ++++-- clang/test/Driver/fsanitize.c | 4 +++- clang/test/Driver/fuchsia.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 90a747ca58986..3fe4ce5d893b8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6786,8 +6786,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--offload-new-driver"); } - SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); - const XRayArgs &XRay = TC.getXRayArgs(); XRay.addArgs(TC, Args, CmdArgs, InputType); @@ -7677,6 +7675,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } + // This needs to run after -Xclang argument forwarding to pick up the target + // features enabled through -Xclang -target-feature flags. + SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); + // With -save-temps, we want to save the unoptimized bitcode output from the // CompileJobAction, use -disable-llvm-passes to get pristine IR generated // by the frontend. diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index f86c978f221cd..6ecf0b57bee5c 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -197,6 +197,8 @@ // CHECK-SANMT-MT: "-target-feature" "+mte" // CHECK-SANMT-MT-SAME: "-fsanitize=memtag-stack,memtag-heap,memtag-globals" +// RUN: not %clang --target=aarch64-linux -fsanitize=memtag -Xclang -target-feature -Xclang +mte %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-MT + // RUN: not %clang --target=aarch64-linux -fsanitize=memtag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-NOMT-0 // CHECK-SANMT-NOMT-0: '-fsanitize=memtag-stack' requires hardware support (+memtag) @@ -726,8 +728,8 @@ // NO-SP-NOT: stack-protector // NO-SP: "-fsanitize=safe-stack" // SP-ASAN: error: invalid argument '-fsanitize=safe-stack' not allowed with '-fsanitize=address' -// SP: "-fsanitize=safe-stack" // SP: -stack-protector +// SP: "-fsanitize=safe-stack" // NO-SP-NOT: stack-protector // RUN: %clang --target=powerpc64-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index c67f7f8c005b3..83dee16981690 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -30,10 +30,10 @@ // CHECK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK: "-isysroot" "[[SYSROOT:[^"]+]]" // CHECK: "-internal-externc-isystem" "[[SYSROOT]]{{/|\\\\}}include" +// CHECK: "-stack-protector" "2" // CHECK-AARCH64: "-fsanitize=shadow-call-stack" // CHECK-RISCV64: "-fsanitize=shadow-call-stack" // CHECK-X86_64: "-fsanitize=safe-stack" -// CHECK: "-stack-protector" "2" // CHECK-AARCH64: "-target-feature" "+outline-atomics" // CHECK-NOT: "-fcommon" // CHECK: {{.*}}ld.lld{{.*}}" "-z" "max-page-size=4096" "-z" "now" "-z" "start-stop-visibility=hidden" "-z" "rodynamic" "-z" "separate-loadable-segments" "-z" "rel" "--pack-dyn-relocs=relr" From 3836d4acccbe87216133d08d75df509e95c291f0 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 14:27:31 -0700 Subject: [PATCH 297/425] [lldb] Convert ConnectionGenericFileWindows.cpp to new Status API (NFC) --- 
lldb/source/Host/windows/ConnectionGenericFileWindows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp index fd78a625d9cb9..170444f56b310 100644 --- a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp +++ b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp @@ -236,7 +236,7 @@ size_t ConnectionGenericFile::Read(void *dst, size_t dst_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = return_info.GetError(); + *error_ptr = Status::FromError(return_info.GetError()); // kBytesAvailableEvent is a manual reset event. Make sure it gets reset // here so that any subsequent operations don't immediately see bytes @@ -290,7 +290,7 @@ size_t ConnectionGenericFile::Write(const void *src, size_t src_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = return_info.GetError(); + *error_ptr = Status::FromError(return_info.GetError()); IncrementFilePointer(return_info.GetBytes()); Log *log = GetLog(LLDBLog::Connection); From f00c946c2da0caf6da4a49e87ac905a8b1d2e8b6 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 14:28:28 -0700 Subject: [PATCH 298/425] [lldb] Convert MainLoopWindows.cpp to new Status API (NFC) --- lldb/source/Host/windows/MainLoopWindows.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp index 551e73e6904ae..88d929535ab6c 100644 --- a/lldb/source/Host/windows/MainLoopWindows.cpp +++ b/lldb/source/Host/windows/MainLoopWindows.cpp @@ -121,7 +121,7 @@ Status MainLoopWindows::Run() { llvm::Expected signaled_event = Poll(); if (!signaled_event) - return Status(signaled_event.takeError()); + return Status::FromError(signaled_event.takeError()); if (*signaled_event < m_read_fds.size()) { auto &KV = *std::next(m_read_fds.begin(), *signaled_event); From 64498c54831bed9cf069e0923b9b73678c6451d8 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Thu, 5 Sep 2024 14:49:03 -0700 Subject: [PATCH 299/425] [LTO][ELF][lld] Use unique string saver in ELF bitcode symbol parsing (#106670) lld ELF [BitcodeFile](https://github.com/llvm/llvm-project/blob/a527248a3c2d638b0c92a06992f3f1c1f80842ad/lld/ELF/InputFiles.h#L328) uses [string saver](https://github.com/llvm/llvm-project/blob/a527248a3c2d638b0c92a06992f3f1c1f80842ad/lld/include/lld/Common/CommonLinkerContext.h#L57) to keep copies of bitcode symbols. Symbol duplication is very common when compiling application binaries. This change proposes to introduce a UniqueStringSaver in lld context and use it for bitcode symbol parsing. The implementation covers ELF only. Similar opportunities should exist on other (COFF, MachO, wasm) formats. For an internal production binary where lto indexing takes ~10GiB originally, this changes optimizes away ~800MiB (~7.8%), measured by https://github.com/google/pprof. Flame graph breaks down memory by usage call stacks and agrees with this measurement. --- lld/ELF/InputFiles.cpp | 8 ++++++-- lld/include/lld/Common/CommonLinkerContext.h | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 7adc35f20984a..1570adf137093 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1744,8 +1744,10 @@ createBitcodeSymbol(Symbol *&sym, const std::vector &keptComdats, uint8_t type = objSym.isTLS() ? 
STT_TLS : STT_NOTYPE; uint8_t visibility = mapVisibility(objSym.getVisibility()); + // Symbols can be duplicated in bitcode files because of '#include' and + // linkonce_odr. Use unique_saver to save symbol names for de-duplication. if (!sym) - sym = symtab.insert(saver().save(objSym.getName())); + sym = symtab.insert(unique_saver().save(objSym.getName())); int c = objSym.getComdatIndex(); if (objSym.isUndefined() || (c != -1 && !keptComdats[c])) { @@ -1797,7 +1799,9 @@ void BitcodeFile::parseLazy() { symbols = std::make_unique(numSymbols); for (auto [i, irSym] : llvm::enumerate(obj->symbols())) if (!irSym.isUndefined()) { - auto *sym = symtab.insert(saver().save(irSym.getName())); + // Symbols can be duplicated in bitcode files because of '#include' and + // linkonce_odr. Use unique_saver to save symbol names for de-duplication. + auto *sym = symtab.insert(unique_saver().save(irSym.getName())); sym->resolve(LazySymbol{*this}); symbols[i] = sym; } diff --git a/lld/include/lld/Common/CommonLinkerContext.h b/lld/include/lld/Common/CommonLinkerContext.h index 0627bbdc8bd87..9970dfcb713f8 100644 --- a/lld/include/lld/Common/CommonLinkerContext.h +++ b/lld/include/lld/Common/CommonLinkerContext.h @@ -38,6 +38,7 @@ class CommonLinkerContext { llvm::BumpPtrAllocator bAlloc; llvm::StringSaver saver{bAlloc}; + llvm::UniqueStringSaver unique_saver{bAlloc}; llvm::DenseMap instances; ErrorHandler e; @@ -54,8 +55,13 @@ template T &context() { bool hasContext(); -inline llvm::StringSaver &saver() { return context().saver; } inline llvm::BumpPtrAllocator &bAlloc() { return context().bAlloc; } +inline llvm::StringSaver &saver() { return context().saver; } +inline llvm::UniqueStringSaver &unique_saver() { + // FIXME: Look into other places where duplications are common in saved + // strings and unique saver make sense. + return context().unique_saver; +} } // namespace lld #endif From 50be455ab88b17872cd620698156b4058dc92f58 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 14:52:30 -0700 Subject: [PATCH 300/425] [TableGen] Add check for number of intrinsic return values (#107326) Fail if we see an intrinsic that returns more than the supported number of return values. Intrinsics can return only up to a certain number of values, as defined by the `IIT_RetNumbers` list in `Intrinsics.td`. Currently, if we define an intrinsic that exceeds the limit, llvm-tblgen crashes. Instead, read this limit and fail if it's exceeded with a proper error message.
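Concretely, with the current `IIT_RetNumbers` list the limit works out to 9 return values (the list size minus one). The updated intrinsic-struct.td test below exercises both sides of the check: a 9-result intrinsic is still accepted, while

  def int_returns_10_results : Intrinsic< !listsplat(llvm_anyint_ty, 10), [], [], "llvm.returns.10.results">;

is now rejected with "error: intrinsics can only return upto 9 values, 'int_returns_10_results' returns 10 values" instead of crashing llvm-tblgen.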
--- llvm/test/TableGen/intrinsic-struct.td | 25 +++++++++++++------ .../TableGen/Basic/CodeGenIntrinsics.cpp | 17 ++++++++++++- llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 3 +++ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index f23a7a7643af2..467fd9057c183 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,11 +1,22 @@ -// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s +// RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s --check-prefix=CHECK-ENUM +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS > /dev/null 2>&1 +// RUN: not llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS -DENABLE_ERROR 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + // XFAIL: vg_leak include "llvm/IR/Intrinsics.td" -// Make sure we can return up to 8 values -// CHECK: returns_8_results = {{[0-9]+}}, // llvm.returns.8.results -def int_returns_8_results : Intrinsic< - [llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, - llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty], - [], [], "llvm.returns.8.results">; +// Make sure we can return up to 9 values. +// CHECK-ENUM: returns_9_results = {{[0-9]+}}, // llvm.returns.9.results +def int_returns_9_results : Intrinsic< + !listsplat(llvm_anyint_ty, 9), + [], [], "llvm.returns.9.results">; + +#ifdef ENABLE_ERROR +// CHECK-ERROR: error: intrinsics can only return upto 9 values, 'int_returns_10_results' returns 10 values +// CHECK-ERROR-NEXT: def int_returns_10_results : Intrinsic< +def int_returns_10_results : Intrinsic< + !listsplat(llvm_anyint_ty, 10), + [], [], "llvm.returns.10.results">; + +#endif diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index a30a7577408f8..23c64912c780f 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -29,6 +29,15 @@ CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { for (const Record *Rec : RC.getAllDerivedDefinitions("IntrinsicProperty")) if (Rec->getValueAsBit("IsDefault")) DefaultProperties.push_back(Rec); + + // The maximum number of values that an intrinsic can return is the size of + // of `IIT_RetNumbers` list - 1 (since we index into this list using the + // number of return values as the index). 
+ const auto *IIT_RetNumbers = + dyn_cast_or_null(RC.getGlobal("IIT_RetNumbers")); + if (!IIT_RetNumbers) + PrintFatalError("unable to find 'IIT_RetNumbers' list"); + MaxNumReturn = IIT_RetNumbers->size() - 1; } CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { @@ -106,6 +115,13 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, TargetPrefix + ".'!"); } + unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); + if (NumRet > Ctx.MaxNumReturn) + PrintFatalError(DefLoc, "intrinsics can only return upto " + + Twine(Ctx.MaxNumReturn) + " values, '" + + DefName + "' returns " + Twine(NumRet) + + " values"); + const Record *TypeInfo = R->getValueAsDef("TypeInfo"); if (!TypeInfo->isSubClassOf("TypeInfoGen")) PrintFatalError(DefLoc, "TypeInfo field in " + DefName + @@ -116,7 +132,6 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, // Types field is a concatenation of Return types followed by Param types. unsigned Idx = 0; - unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); for (; Idx < NumRet; ++Idx) IS.RetTys.push_back(TypeList->getElementAsRecord(Idx)); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 51c2359155380..83282d18789b2 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -30,6 +30,9 @@ class RecordKeeper; struct CodeGenIntrinsicContext { explicit CodeGenIntrinsicContext(const RecordKeeper &RC); std::vector DefaultProperties; + + // Maximum number of values an intrinsic can return. + unsigned MaxNumReturn; }; struct CodeGenIntrinsic { From 3380dae2f0d6b8035744da573c4508b98c80045c Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 5 Sep 2024 14:55:05 -0700 Subject: [PATCH 301/425] [lld][InstrProf] Refactor BPSectionOrderer.cpp (#107347) Refactor some code in `BPSectionOrderer.cpp` in preparation for https://github.com/llvm/llvm-project/pull/107348. * Rename `constructNodesForCompression()` -> `getUnsForCompression()` and return a `SmallVector` directly rather than populating a vector alias * Pass `duplicateSectionIdxs` as a pointer to make it possible to skip finding (nearly) duplicate sections * Combine `duplicate{Function,Data}SectionIdxs` into one variable * Compute all `BPFunctionNode` vectors at the end (like `nodesForStartup`) There should be no functional change. --- lld/MachO/BPSectionOrderer.cpp | 124 ++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index 568843d72bbb5..97458c96fd80d 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -21,6 +21,8 @@ using namespace llvm; using namespace lld::macho; +using UtilityNodes = SmallVector; + /// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and /// "yyyy" are numbers that could change between builds. We need to use the root /// symbol name before this suffix so these symbols can be matched with profiles @@ -60,12 +62,15 @@ getRelocHash(const Reloc &reloc, return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend); } -static void constructNodesForCompression( - const SmallVector §ions, +/// Given \p sectionIdxs, a list of section indexes, return a list of utility +/// nodes for each section index. If \p duplicateSectionIdx is provided, +/// populate it with nearly identical sections. Increment \p maxUN to be the +/// largest utility node we have used so far. 
+static SmallVector> getUnsForCompression( + ArrayRef sections, const DenseMap §ionToIdx, - const SmallVector §ionIdxs, - std::vector &nodes, - DenseMap> &duplicateSectionIdxs, + ArrayRef sectionIdxs, + DenseMap> *duplicateSectionIdxs, BPFunctionNode::UtilityNodeT &maxUN) { TimeTraceScope timeScope("Build nodes for compression"); @@ -103,49 +108,52 @@ static void constructNodesForCompression( for (auto hash : hashes) ++hashFrequency[hash]; - // Merge section that are nearly identical - SmallVector>> newSectionHashes; - DenseMap wholeHashToSectionIdx; - for (auto &[sectionIdx, hashes] : sectionHashes) { - uint64_t wholeHash = 0; - for (auto hash : hashes) - if (hashFrequency[hash] > 5) - wholeHash ^= hash; - auto [it, wasInserted] = - wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx)); - if (wasInserted) { - newSectionHashes.emplace_back(sectionIdx, hashes); - } else { - duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx); + if (duplicateSectionIdxs) { + // Merge section that are nearly identical + SmallVector>> newSectionHashes; + DenseMap wholeHashToSectionIdx; + for (auto &[sectionIdx, hashes] : sectionHashes) { + uint64_t wholeHash = 0; + for (auto hash : hashes) + if (hashFrequency[hash] > 5) + wholeHash ^= hash; + auto [it, wasInserted] = + wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx)); + if (wasInserted) { + newSectionHashes.emplace_back(sectionIdx, hashes); + } else { + (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx); + } } - } - sectionHashes = newSectionHashes; + sectionHashes = newSectionHashes; - // Recompute hash frequencies - hashFrequency.clear(); - for (auto &[sectionIdx, hashes] : sectionHashes) - for (auto hash : hashes) - ++hashFrequency[hash]; + // Recompute hash frequencies + hashFrequency.clear(); + for (auto &[sectionIdx, hashes] : sectionHashes) + for (auto hash : hashes) + ++hashFrequency[hash]; + } // Filter rare and common hashes and assign each a unique utility node that // doesn't conflict with the trace utility nodes DenseMap hashToUN; for (auto &[hash, frequency] : hashFrequency) { - if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size()) + if (frequency <= 1 || frequency * 2 > sectionHashes.size()) continue; hashToUN[hash] = ++maxUN; } - std::vector uns; + SmallVector> sectionUns; for (auto &[sectionIdx, hashes] : sectionHashes) { + UtilityNodes uns; for (auto &hash : hashes) { auto it = hashToUN.find(hash); if (it != hashToUN.end()) uns.push_back(it->second); } - nodes.emplace_back(sectionIdx, uns); - uns.clear(); + sectionUns.emplace_back(sectionIdx, uns); } + return sectionUns; } DenseMap lld::macho::runBalancedPartitioning( @@ -185,10 +193,10 @@ DenseMap lld::macho::runBalancedPartitioning( sectionIdxs.end()); } - std::vector nodesForStartup; BPFunctionNode::UtilityNodeT maxUN = 0; - DenseMap> - startupSectionIdxUNs; + DenseMap startupSectionIdxUNs; + // Used to define the initial order for startup functions. + DenseMap sectionIdxToTimestamp; std::unique_ptr reader; if (!profilePath.empty()) { auto fs = vfs::getRealFileSystem(); @@ -202,8 +210,6 @@ DenseMap lld::macho::runBalancedPartitioning( } auto &traces = reader->getTemporalProfTraces(); - // Used to define the initial order for startup functions. 
- DenseMap sectionIdxToTimestamp; DenseMap sectionIdxToFirstUN; for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) { uint64_t currentSize = 0, cutoffSize = 1; @@ -245,15 +251,6 @@ DenseMap lld::macho::runBalancedPartitioning( ++maxUN; sectionIdxToFirstUN.clear(); } - - // These uns should already be sorted without duplicates. - for (auto &[sectionIdx, uns] : startupSectionIdxUNs) - nodesForStartup.emplace_back(sectionIdx, uns); - - llvm::sort(nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) { - return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) < - std::make_pair(sectionIdxToTimestamp[R.Id], R.Id); - }); } SmallVector sectionIdxsForFunctionCompression, @@ -271,21 +268,32 @@ DenseMap lld::macho::runBalancedPartitioning( } } - std::vector nodesForFunctionCompression, - nodesForDataCompression; // Map a section index (to be ordered for compression) to a list of duplicate // section indices (not ordered for compression). - DenseMap> duplicateFunctionSectionIdxs, - duplicateDataSectionIdxs; - constructNodesForCompression( + DenseMap> duplicateSectionIdxs; + auto unsForFunctionCompression = getUnsForCompression( sections, sectionToIdx, sectionIdxsForFunctionCompression, - nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN); - constructNodesForCompression( + &duplicateSectionIdxs, maxUN); + auto unsForDataCompression = getUnsForCompression( sections, sectionToIdx, sectionIdxsForDataCompression, - nodesForDataCompression, duplicateDataSectionIdxs, maxUN); + &duplicateSectionIdxs, maxUN); - // Sort nodes by their Id (which is the section index) because the input - // linker order tends to be not bad + std::vector nodesForStartup, nodesForFunctionCompression, + nodesForDataCompression; + for (auto &[sectionIdx, uns] : startupSectionIdxUNs) + nodesForStartup.emplace_back(sectionIdx, uns); + for (auto &[sectionIdx, uns] : unsForFunctionCompression) + nodesForFunctionCompression.emplace_back(sectionIdx, uns); + for (auto &[sectionIdx, uns] : unsForDataCompression) + nodesForDataCompression.emplace_back(sectionIdx, uns); + + // Use the first timestamp to define the initial order for startup nodes. + llvm::sort(nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) { + return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) < + std::make_pair(sectionIdxToTimestamp[R.Id], R.Id); + }); + // Sort compression nodes by their Id (which is the section index) because the + // input linker order tends to be not bad. 
llvm::sort(nodesForFunctionCompression, [](auto &L, auto &R) { return L.Id < R.Id; }); llvm::sort(nodesForDataCompression, @@ -318,8 +326,8 @@ DenseMap lld::macho::runBalancedPartitioning( if (orderedSections.insert(isec)) ++numCodeCompressionSections; - auto It = duplicateFunctionSectionIdxs.find(node.Id); - if (It == duplicateFunctionSectionIdxs.end()) + auto It = duplicateSectionIdxs.find(node.Id); + if (It == duplicateSectionIdxs.end()) continue; for (auto dupSecIdx : It->getSecond()) { const auto *dupIsec = sections[dupSecIdx]; @@ -332,8 +340,8 @@ DenseMap lld::macho::runBalancedPartitioning( const auto *isec = sections[node.Id]; if (orderedSections.insert(isec)) ++numDataCompressionSections; - auto It = duplicateDataSectionIdxs.find(node.Id); - if (It == duplicateDataSectionIdxs.end()) + auto It = duplicateSectionIdxs.find(node.Id); + if (It == duplicateSectionIdxs.end()) continue; for (auto dupSecIdx : It->getSecond()) { const auto *dupIsec = sections[dupSecIdx]; From 7ea9f0d85fc3dc80b45e6ba7087c41c6f2481f07 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Thu, 5 Sep 2024 14:51:14 -0700 Subject: [PATCH 302/425] [SandboxVec][NFC] Remove unused header files --- .../Vectorize/SandboxVectorizer/SandboxVectorizer.h | 7 ++----- .../Vectorize/SandboxVectorizer/SandboxVectorizer.cpp | 3 ++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h index 0d4dbae44521e..dd9f02d327264 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -8,15 +8,12 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/PassManager.h" -#include "llvm/SandboxIR/SandboxIR.h" namespace llvm { +class TargetTransformInfo; + class SandboxVectorizerPass : public PassInfoMixin { TargetTransformInfo *TTI = nullptr; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 072a6606694a0..ec4bfbb56ecb4 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/SandboxIR/SandboxIR.h" using namespace llvm; From 73514f6831cfcea49f33fb9e31db0141b05532f2 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Thu, 5 Sep 2024 23:04:35 +0100 Subject: [PATCH 303/425] [libc] Add proxy header for __sighandler_t type (#107354) Added proxy headers for __sighandler_t type, modified the corresponding CMakeLists.txt files and test files --- libc/hdr/types/CMakeLists.txt | 10 ++++++++++ libc/hdr/types/sighandler_t.h | 24 ++++++++++++++++++++++++ libc/src/signal/linux/signal.cpp | 1 + libc/src/signal/signal.h | 4 +--- libc/test/src/signal/CMakeLists.txt | 2 +- libc/test/src/signal/signal_test.cpp | 4 ++-- 6 files changed, 
39 insertions(+), 6 deletions(-) create mode 100644 libc/hdr/types/sighandler_t.h diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index f41576c07d99b..ea7bbccffbb81 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -171,3 +171,13 @@ add_proxy_header_library( libc.include.llvm-libc-types.locale_t libc.include.locale ) + +add_proxy_header_library( + __sighandler_t + HDRS + sighandler_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.__sighandler_t + libc.include.signal +) + diff --git a/libc/hdr/types/sighandler_t.h b/libc/hdr/types/sighandler_t.h new file mode 100644 index 0000000000000..b18f8e856c5b6 --- /dev/null +++ b/libc/hdr/types/sighandler_t.h @@ -0,0 +1,24 @@ +//===-- Definition of macros from __sighandler_t.h ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_SIGHANDLER_T_H +#define LLVM_LIBC_HDR_SIGHANDLER_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/__sighandler_t.h" + +using sighandler_t = __sighandler_t; + +#else // overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_SIGHANDLER_T_H diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp index 241258faf2e81..1da0ef8c97a20 100644 --- a/libc/src/signal/linux/signal.cpp +++ b/libc/src/signal/linux/signal.cpp @@ -8,6 +8,7 @@ #include "src/signal/signal.h" #include "hdr/signal_macros.h" +#include "hdr/types/sighandler_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/signal/sigaction.h" diff --git a/libc/src/signal/signal.h b/libc/src/signal/signal.h index 2037305f8c11e..06e77e11bf0bd 100644 --- a/libc/src/signal/signal.h +++ b/libc/src/signal/signal.h @@ -9,13 +9,11 @@ #ifndef LLVM_LIBC_SRC_SIGNAL_SIGNAL_H #define LLVM_LIBC_SRC_SIGNAL_SIGNAL_H +#include "hdr/types/sighandler_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { -using sighandler_t = __sighandler_t; - sighandler_t signal(int signum, sighandler_t handler); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index edbd5c19edab3..f7923204eaf49 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -74,7 +74,7 @@ add_libc_unittest( SRCS signal_test.cpp DEPENDS - libc.include.signal + libc.hdr.types.sighandler_t libc.src.errno.errno libc.src.signal.raise libc.src.signal.signal diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 70e95a8c159a8..4b57311eee2d8 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -13,14 +13,14 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include +#include "hdr/types/sighandler_t.h" using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcSignal, Invalid) { LIBC_NAMESPACE::libc_errno = 0; - LIBC_NAMESPACE::sighandler_t valid = +[](int) {}; + sighandler_t valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); EXPECT_THAT((void *)LIBC_NAMESPACE::signal(65, valid), From 
e44a67543c0b6a3a2307362f5bbcf54cd6de6a8e Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 5 Sep 2024 15:05:15 -0700 Subject: [PATCH 304/425] AMDGPU: Add a few unsupported checks for llvm.fptrunc.round intrinsic (#107330) A check here can be removed when we implement support for the corresponding types/mode. --- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index f1d5b07e832c4..291fe00a6177b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -1,11 +1,54 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL +; RUN: split-file %s %t -define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) { -; FAIL: LLVM ERROR: Cannot select +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s + +; TODO: check for GISEL when bfloat is supported. 
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f32-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F32-FAIL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F64-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s + +;--- f16-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %out) { +; F16-F64-FAIL: LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward") - store half %res, ptr addrspace(1) %out, align 4 + store half %res, ptr addrspace(1) %out, align 2 ret void } -declare half @llvm.fptrunc.round.f16.f64(double, metadata) +;--- f32-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_f32_f64(double %a, ptr addrspace(1) %out) { +; F32-F64-FAIL: LLVM ERROR: Cannot select + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +;--- bf16-f32-err.ll +define amdgpu_gs void @test_fptrunc_round_bf16_f32(float %a, ptr addrspace(1) %out) { +; BF16-F32-FAIL: LLVM ERROR: Cannot select + %res = call bfloat @llvm.fptrunc.round.bf16.f32(float %a, metadata !"round.towardzero") + store bfloat %res, ptr addrspace(1) %out, align 2 + ret void +} + +;--- bf16-f64-err.ll +define amdgpu_gs void @test_fptrunc_round_bf16_f64(double %a, ptr addrspace(1) %out) { +; BF16-F64-FAIL: LLVM ERROR: Cannot select + %res = call bfloat @llvm.fptrunc.round.bf16.f32(double %a, metadata !"round.tonearest") + store bfloat %res, ptr addrspace(1) %out, align 2 + ret void +} + +;--- f16-f32-tonearestaway-err.ll +define amdgpu_gs void @test_fptrunc_round_f16_f32_tonearestaway(float %a, ptr addrspace(1) %out) { +; TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select + %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearestaway") + store half %res, ptr addrspace(1) %out, align 2 + ret void +} From 1e98aa4730b1b3b93205af74be26e04d5f876d10 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 15:09:05 -0700 Subject: [PATCH 305/425] [lldb] Convert ConnectionGenericFileWindows.cpp to new Status API (NFC) --- lldb/source/Host/windows/ConnectionGenericFileWindows.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp index 170444f56b310..6a6153d6e34a0 100644 --- a/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp +++ b/lldb/source/Host/windows/ConnectionGenericFileWindows.cpp @@ -236,7 +236,7 @@ size_t ConnectionGenericFile::Read(void *dst, size_t dst_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = Status::FromError(return_info.GetError()); + *error_ptr = return_info.GetError().Clone(); // kBytesAvailableEvent is a manual reset event. 
Make sure it gets reset // here so that any subsequent operations don't immediately see bytes @@ -290,7 +290,7 @@ size_t ConnectionGenericFile::Write(const void *src, size_t src_len, finish: status = return_info.GetStatus(); if (error_ptr) - *error_ptr = Status::FromError(return_info.GetError()); + *error_ptr = return_info.GetError().Clone(); IncrementFilePointer(return_info.GetBytes()); Log *log = GetLog(LLDBLog::Connection); From bd840a40042c2c67f56079493d0bcdbfc70325ba Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 5 Sep 2024 15:14:31 -0700 Subject: [PATCH 306/425] [AMDGPU] Add target intrinsic for s_prefetch_data (#107133) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 3 + .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 26 +++- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++ .../AMDGPU/AMDGPUInstructionSelector.cpp | 7 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 15 ++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 22 +++ .../AMDGPU/llvm.amdgcn.s.prefetch.data.ll | 136 ++++++++++++++++++ 10 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ab29ef38f7792..5060647d35764 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 02d8726baa421..da7a1a55da531 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); } + case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: + return emitBuiltinWithOneOverloadedType<2>( + *this, E, Intrinsic::amdgcn_s_prefetch_data); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index d9ec258e644c9..34ee44afe0f10 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -256,4 +256,28 @@ void test_s_ttracedata_imm() __builtin_amdgcn_s_ttracedata_imm(1); } - +// CHECK-LABEL: @test_s_prefetch_data( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5) +// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8 +// 
CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31) +// CHECK-NEXT: ret void +// +void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len) +{ + __builtin_amdgcn_s_prefetch_data(fp, 0); + __builtin_amdgcn_s_prefetch_data(gp, len); + __builtin_amdgcn_s_prefetch_data(cp, 31); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index dc13a35c66f9a..a5259ba9eec36 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic; def int_amdgcn_wave_id : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef, IntrNoMem, IntrSpeculatable]>; +def int_amdgcn_s_prefetch_data : + Intrinsic<[], + [llvm_anyptr_ty, // Pointer to a constant/global memory + llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes) + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] + >; + //===----------------------------------------------------------------------===// // Deep learning intrinsics.
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3fcb364fc2c53..9bebd418bb426 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { - MIB.addImm(MI.getOperand(OpIdx).getImm()); + const MachineOperand &Op = MI.getOperand(OpIdx); + int64_t Imm; + if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm))) + MIB.addImm(Imm); + else + MIB.addImm(Op.getImm()); } void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 4737a322c255f..a2e6842b760f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); return; } + case Intrinsic::amdgcn_s_prefetch_data: { + Register PtrReg = MI.getOperand(1).getReg(); + unsigned AS = MRI.getType(PtrReg).getAddressSpace(); + if (AMDGPU::isFlatGlobalAddrSpace(AS)) { + constrainOpWithReadfirstlane(B, MI, 1); + constrainOpWithReadfirstlane(B, MI, 2); + } else + MI.eraseFromParent(); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_pops_exiting_wave_id: return getDefaultMappingSOP(MI); + case Intrinsic::amdgcn_s_prefetch_data: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81b52935ddf39..accc3084217f2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_s_prefetch_data: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } default: return false; } @@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_prefetch_data: { + // For non-global address space preserve the chain and remove the call. 
+ if (!AMDGPU::isFlatGlobalAddrSpace(cast(Op)->getAddressSpace())) + return Op.getOperand(0); + return Op; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 844f62abc2671..90e11df500bc9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, SBase->setReg(SGPR); } MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); - if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) { Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); SOff->setReg(SGPR); } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 9fc570bb85f24..e7db4f49d9e54 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat { defm : SMPrefetchPat<"INST", i32imm_zero>; defm : SMPrefetchPat<"DATA", i32imm_one>; +let SubtargetPredicate = isGFX12Plus in { + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, $offset, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)), + (S_PREFETCH_DATA $sbase, 0, $len, 0) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len), + (S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len)) + >; + + def : GCNPat < + (int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len), + (S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len)) + >; +} // End let SubtargetPredicate = isGFX12Plus + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll new file mode 100644 index 0000000000000..54c39d78adb58 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) { +; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0 +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0 +; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0 +; GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len) + ret void +} + 
+define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) { +; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) { +; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31) + ret void +} + +define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) { +; GCN-LABEL: prefetch_data_vgpr_base_imm_len: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0) + ret void +} + +declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len) +declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len) From c1c42518c1356e78a10bf252a4a5a643b2bb9efd Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 5 Sep 2024 15:25:50 -0700 Subject: [PATCH 307/425] [SandboxVec] Early return checks (#107465) This patch implements a couple of early return checks. --- .../SandboxVectorizer/SandboxVectorizer.cpp | 11 +++++++++ .../X86/no_implicit_float.ll | 23 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ec4bfbb56ecb4..e9be6f5283fea 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -29,7 +29,18 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F, } bool SandboxVectorizerPass::runImpl(Function &F) { + // If the target claims to have no vector registers early return. + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { + LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n"); + return false; + } LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n"); + // Early return if the attribute NoImplicitFloat is used. + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) { + LLVM_DEBUG(dbgs() << "SBVec: NoImplicitFloat attribute, return.\n"); + return false; + } + sandboxir::Context Ctx(F.getContext()); // Create SandboxIR for `F`. 
sandboxir::Function &SBF = *Ctx.createFunction(&F); diff --git a/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll b/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll new file mode 100644 index 0000000000000..37fc03cb31166 --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/X86/no_implicit_float.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer -mtriple=x86_64-- -mattr=+avx %s -S | FileCheck %s + +; Check that we don't vectorize a NoImplicitFloat function. +define void @no_implicit_float(ptr %ptr) noimplicitfloat { +; CHECK-LABEL: define void @no_implicit_float( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr double, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr double, ptr [[PTR]], i32 1 +; CHECK-NEXT: [[LD0:%.*]] = load double, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[PTR1]], align 8 +; CHECK-NEXT: store double [[LD0]], ptr [[PTR0]], align 8 +; CHECK-NEXT: store double [[LD1]], ptr [[PTR1]], align 8 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr double, ptr %ptr, i32 0 + %ptr1 = getelementptr double, ptr %ptr, i32 1 + %ld0 = load double, ptr %ptr0 + %ld1 = load double, ptr %ptr1 + store double %ld0, ptr %ptr0 + store double %ld1, ptr %ptr1 + ret void +} From 169d453429ca9015046b42719ff5d13cda5d2c6f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 15:47:28 -0700 Subject: [PATCH 308/425] [ADT] Declare replaceAllocation in SmallVector.cpp (NFC) (#107469) This patch changes replaceAllocation to a static function while moving the declaration to SmallVector.cpp. Note that: - replaceAllocation is used only within SmallVector.cpp. - replaceAllocation doesn't access any class members. --- llvm/include/llvm/ADT/SmallVector.h | 13 ------------- llvm/lib/Support/SmallVector.cpp | 16 ++++++++++++---- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 730f84ca038d6..bd3e887e36bce 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -74,19 +74,6 @@ template class SmallVectorBase { /// This function will report a fatal error if it cannot increase capacity. void grow_pod(void *FirstEl, size_t MinSize, size_t TSize); - /// If vector was first created with capacity 0, getFirstEl() points to the - /// memory right after, an area unallocated. If a subsequent allocation, - /// that grows the vector, happens to return the same pointer as getFirstEl(), - /// get a new allocation, otherwise isSmall() will falsely return that no - /// allocation was done (true) and the memory will not be freed in the - /// destructor. If a VSize is given (vector size), also copy that many - /// elements to the new allocation - used if realloca fails to increase - /// space, and happens to allocate precisely at BeginX. - /// This is unlikely to be called often, but resolves a memory leak when the - /// situation does occur. 
- void *replaceAllocation(void *NewElts, size_t TSize, size_t NewCapacity, - size_t VSize = 0); - public: size_t size() const { return Size; } size_t capacity() const { return Capacity; } diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp index b6ce37842040b..dceea4fbc630e 100644 --- a/llvm/lib/Support/SmallVector.cpp +++ b/llvm/lib/Support/SmallVector.cpp @@ -108,10 +108,18 @@ static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) { return std::clamp(NewCapacity, MinSize, MaxSize); } -template -void *SmallVectorBase::replaceAllocation(void *NewElts, size_t TSize, - size_t NewCapacity, - size_t VSize) { +/// If vector was first created with capacity 0, getFirstEl() points to the +/// memory right after, an area unallocated. If a subsequent allocation, +/// that grows the vector, happens to return the same pointer as getFirstEl(), +/// get a new allocation, otherwise isSmall() will falsely return that no +/// allocation was done (true) and the memory will not be freed in the +/// destructor. If a VSize is given (vector size), also copy that many +/// elements to the new allocation - used if realloca fails to increase +/// space, and happens to allocate precisely at BeginX. +/// This is unlikely to be called often, but resolves a memory leak when the +/// situation does occur. +static void *replaceAllocation(void *NewElts, size_t TSize, size_t NewCapacity, + size_t VSize = 0) { void *NewEltsReplace = llvm::safe_malloc(NewCapacity * TSize); if (VSize) memcpy(NewEltsReplace, NewElts, VSize * TSize); From 6d3725924fe6adf0d490697327938de9c3516cbe Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Fri, 6 Sep 2024 07:03:05 +0800 Subject: [PATCH 309/425] [clang-tidy][NFC] remove autosar link in documents (#107412) As discussion in https://discourse.llvm.org/t/clang-tidy-rfc-add-autosar-c-14-clang-tidy-module/59223/12. We should not link clang-tidy check with AUTOSAR rules. --- .../docs/clang-tidy/checks/misc/const-correctness.rst | 3 +-- .../clang-tidy/checks/misc/unconventional-assign-operator.rst | 3 --- .../checks/readability/avoid-nested-conditional-operator.rst | 3 --- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst index 86fba6c7e4f7c..8ac1ad56bc8cf 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst @@ -7,8 +7,7 @@ This check implements detection of local variables which could be declared as ``const`` but are not. Declaring variables as ``const`` is required or recommended by many coding guidelines, such as: `ES.25 `_ -from the C++ Core Guidelines and `AUTOSAR C++14 Rule A7-1-1 (6.7.1 Specifiers) -`_. +from the C++ Core Guidelines. Please note that this check's analysis is type-based only. Variables that are not modified but used to create a non-const handle that might escape the scope are not diagnosed diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst index 3b4b65a5cb683..49e3fd5b6ee42 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/unconventional-assign-operator.rst @@ -13,6 +13,3 @@ types and definitions with good return type but wrong ``return`` statements. 
type (e.g. ``int``). * Private and deleted operators are ignored. * The operator must always return ``*this``. - -This check implements `AUTOSAR C++14 Rule A13-2-1 -`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst index 44b74283292ce..cd3906855d497 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-nested-conditional-operator.rst @@ -16,6 +16,3 @@ Examples: int NestInConditional = (condition1 ? true1 : false1) ? true2 : false2; int NestInTrue = condition1 ? (condition2 ? true1 : false1) : false2; int NestInFalse = condition1 ? true1 : condition2 ? true2 : false1; - -This check implements part of `AUTOSAR C++14 Rule A5-16-1 -`_. From c8834527b729c8c89f453d215e667047fd948aa1 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Thu, 5 Sep 2024 18:07:36 -0500 Subject: [PATCH 310/425] [TOSA] Move CreateOpAndInfer into ConversionUtils.h (#106122) This moves CreateOpAndInfer from TF legalize_util.h into ConversionUtils.h also removed duplicate createOpAndInfer function from TosaDecomposeTransposeConv.cpp Renamed to CreateOpAndInferShape so we can upstream this independently of tensorflow (otherwise a redefinition error would break TF compile if not upstreamed together with removal of CreateOpAndInfer in TF) --------- Signed-off-by: Tai Ly --- .../mlir/Dialect/Tosa/Utils/ConversionUtils.h | 137 ++++++++++++++++++ .../Transforms/TosaDecomposeTransposeConv.cpp | 93 +++--------- .../Dialect/Tosa/Utils/ConversionUtils.cpp | 12 +- 3 files changed, 169 insertions(+), 73 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h index ceab7d9c628a5..ef40b348ab549 100644 --- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h +++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h @@ -15,7 +15,9 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/PatternMatch.h" #include @@ -79,6 +81,141 @@ checkHasDynamicBatchDims(PatternRewriter &rewriter, Op op, LogicalResult EqualizeRanks(PatternRewriter &rewriter, Location loc, Value &input1, Value &input2); +LogicalResult EqualizeRanks(ImplicitLocOpBuilder &builder, Value &input1, + Value &input2); + +namespace { + +// Creates a TOSA operation and performs shape inference on the individual +// op. This allows shape inference when lowering down to TOSA. +template +TosaOp createOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, + Args &&...args) { + auto op = builder.create(resultTy, args...); + + InferShapedTypeOpInterface shapeInterface = + dyn_cast(op.getOperation()); + if (!shapeInterface) + return op; + + SmallVector returnedShapes; + if (shapeInterface + .inferReturnTypeComponents(op.getContext(), builder.getLoc(), + op->getOperands(), op->getAttrDictionary(), + op->getPropertiesStorage(), + op->getRegions(), returnedShapes) + .failed()) + return op; + + // We need to use the element type of the existing result type to generate + // the new result shaped type. This is because rescale can include a cast to + // different bit-width types and does not have a TypeAttr to define the + // target type. 
+ auto result = op->getResult(0); + auto predictedShape = returnedShapes[0]; + auto currentKnowledge = ValueKnowledge::getKnowledgeFromType(resultTy); + + // Compute the knowledge based on the inferred type. + auto inferredKnowledge = ValueKnowledge::getPessimisticValueState(); + inferredKnowledge.dtype = mlir::cast(resultTy).getElementType(); + inferredKnowledge.hasRank = predictedShape.hasRank(); + if (predictedShape.hasRank()) { + for (auto dim : predictedShape.getDims()) { + inferredKnowledge.sizes.push_back(dim); + } + } + + // Compute the new type based on the joined version. + auto newKnowledge = ValueKnowledge::join(currentKnowledge, inferredKnowledge); + Type newTy = + newKnowledge.hasRank + ? Type{mlir::RankedTensorType::get(llvm::ArrayRef(newKnowledge.sizes), + newKnowledge.dtype)} + : Type{mlir::UnrankedTensorType::get(newKnowledge.dtype)}; + result.setType(newTy); + return op; +} + +} // namespace + +// Creates a TOSA operation by: +// - first equalize ranks for ops with SameOperandsAndResultRank trait +// - create operator +// - performs shape inference on this operator +template +TosaOp CreateOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, + Args &&...args) { + if (TosaOp::template hasTrait()) { + // op requires same ranks for tensor operands + if constexpr (sizeof...(Args) == 2) { + auto argX = std::get<0>(std::tie(args...)); + auto argY = std::get<1>(std::tie(args...)); + using ArgX = decltype(argX); + using ArgY = decltype(argY); + if constexpr (std::is_same_v && + std::is_same_v) { + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + if (EqualizeRanks(builder, x, y).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + return createOpAndInferShape(builder, resultTy, x, y); + } + } + if constexpr (sizeof...(Args) == 3) { + auto argX = std::get<0>(std::tie(args...)); + auto argY = std::get<1>(std::tie(args...)); + auto argZ = std::get<2>(std::tie(args...)); + using ArgX = decltype(argX); + using ArgY = decltype(argY); + using ArgZ = decltype(argZ); + if constexpr (std::is_same_v && + std::is_same_v && std::is_same_v) { + // special case for ArithmeticRightShiftOp + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + bool round = std::get<2>(std::tie(args...)); + if (EqualizeRanks(builder, x, y).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + return createOpAndInferShape(builder, resultTy, x, y, round); + } + if constexpr (std::is_same_v && + std::is_same_v && + std::is_same_v) { + // special case for Select + Value x = std::get<0>(std::tie(args...)); + Value y = std::get<1>(std::tie(args...)); + Value z = std::get<2>(std::tie(args...)); + + if (EqualizeRanks(builder, x, y).failed() || + EqualizeRanks(builder, x, z).failed() || + EqualizeRanks(builder, y, z).failed()) { + // incompatible broadcast shapes, no reshape is inserted + // ResultsBroadcastableShape verify will handle this + } + + return createOpAndInferShape(builder, resultTy, x, y, z); + } + } + } + + return createOpAndInferShape(builder, resultTy, args...); +} + +// Creates a TOSA operation by: +// - first equalize ranks for ops with SameOperandsAndResultRank trait +// - create operator +// - performs shape inference on this operator +template +TosaOp CreateOpAndInferShape(PatternRewriter &rewriter, Location loc, + Type resultTy, Args &&...args) { + 
ImplicitLocOpBuilder builder(loc, rewriter); + return CreateOpAndInferShape(builder, resultTy, args...); +} + } // namespace tosa } // namespace mlir diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index a94bb3a920b1d..0779cdb9667a1 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -26,53 +26,6 @@ using namespace mlir::tosa; namespace { -template -TosaOp createOpAndInfer(PatternRewriter &rewriter, Location loc, Type resultTy, - Args &&...args) { - auto op = rewriter.create(loc, resultTy, args...); - - InferShapedTypeOpInterface shapeInterface = - dyn_cast(op.getOperation()); - if (!shapeInterface) - return op; - - SmallVector returnedShapes; - if (shapeInterface - .inferReturnTypeComponents( - op.getContext(), op.getLoc(), op->getOperands(), - op->getDiscardableAttrDictionary(), op->getPropertiesStorage(), - op->getRegions(), returnedShapes) - .failed()) - return op; - - // We need to use the element type of the existing result type to generate - // the new result shaped type. This is because rescale can include a cast to - // different bit-width types and does not have a TypeAttr to define the - // target type. - auto result = op->getResult(0); - auto predictedShape = returnedShapes[0]; - auto currentKnowledge = - mlir::tosa::ValueKnowledge::getKnowledgeFromType(resultTy); - - // Compute the knowledge based on the inferred type. - auto inferredKnowledge = - mlir::tosa::ValueKnowledge::getPessimisticValueState(); - inferredKnowledge.dtype = cast(resultTy).getElementType(); - inferredKnowledge.hasRank = predictedShape.hasRank(); - if (predictedShape.hasRank()) { - for (auto dim : predictedShape.getDims()) { - inferredKnowledge.sizes.push_back(dim); - } - } - - // Compute the new type based on the joined version. - auto newKnowledge = - mlir::tosa::ValueKnowledge::join(currentKnowledge, inferredKnowledge); - auto newTy = newKnowledge.getType(); - result.setType(newTy); - return op; -} - class TransposeConvNonStridedConverter : public OpRewritePattern { public: @@ -187,20 +140,20 @@ class TransposeConvStridedConverter (weightWidth % stride[1]) ? 
(stride[1] - weightWidth % stride[1]) : 0; DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding); - Value weightPaddingVal = createOpAndInfer( + Value weightPaddingVal = CreateOpAndInferShape( rewriter, loc, weightPaddingAttr.getType(), weightPaddingAttr); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, weightPaddingVal, nullptr, rewriter.getAttr(quantInfo.getWeightZp())); } else { - weight = createOpAndInfer(rewriter, loc, - UnrankedTensorType::get(weightETy), - weight, weightPaddingVal); + weight = CreateOpAndInferShape( + rewriter, loc, UnrankedTensorType::get(weightETy), weight, + weightPaddingVal); } weightTy = cast(weight.getType()); @@ -212,7 +165,7 @@ class TransposeConvStridedConverter outputChannels, weightHeight / stride[0], stride[0], weightWidth / stride[1], stride[1], inputChannels}; - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, rewriter.getDenseI64ArrayAttr(weightReshapeDims0)); @@ -221,7 +174,7 @@ class TransposeConvStridedConverter loc, RankedTensorType::get({6}, rewriter.getI32Type()), rewriter.getI32TensorAttr({2, 4, 0, 1, 3, 5})); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, transposeWeightVal); @@ -229,15 +182,15 @@ class TransposeConvStridedConverter llvm::SmallVector weightReshapeDims1 = { outputChannels * stride[0] * stride[1], weightHeight / stride[0], weightWidth / stride[1], inputChannels}; - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, rewriter.getDenseI64ArrayAttr(weightReshapeDims1)); ShapedType restridedWeightTy = cast(weight.getType()); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, /* axis = */ rewriter.getI32IntegerAttr(1)); - weight = createOpAndInfer( + weight = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(weightETy), weight, /* axis = */ rewriter.getI32IntegerAttr(2)); @@ -251,19 +204,19 @@ class TransposeConvStridedConverter DenseElementsAttr inputPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), inputPadding); - Value inputPaddingVal = createOpAndInfer( + Value inputPaddingVal = CreateOpAndInferShape( rewriter, loc, inputPaddingAttr.getType(), inputPaddingAttr); if (op.getQuantizationInfo().has_value()) { auto quantInfo = op.getQuantizationInfo().value(); - input = createOpAndInfer( + input = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(inputETy), input, inputPaddingVal, nullptr, rewriter.getAttr(quantInfo.getInputZp())); } else { - input = createOpAndInfer(rewriter, loc, - UnrankedTensorType::get(inputETy), - input, inputPaddingVal); + input = CreateOpAndInferShape( + rewriter, loc, UnrankedTensorType::get(inputETy), input, + inputPaddingVal); } // We use a zero bias as we need to broadcast the bias. @@ -279,7 +232,7 @@ class TransposeConvStridedConverter // Perform the convolution using the zero bias. 
Value conv2d; if (op.getQuantizationInfo()) { - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), input, weight, zeroBias, /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}), @@ -288,7 +241,7 @@ class TransposeConvStridedConverter *op.getQuantizationInfo()) .getResult(); } else { - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), input, weight, zeroBias, /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}), @@ -307,7 +260,7 @@ class TransposeConvStridedConverter // Factor striding out of the convolution result. llvm::SmallVector convReshapeDims0 = { batch, convHeight, convWidth, stride[0], stride[1], outputChannels}; - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(convReshapeDims0)); @@ -316,14 +269,14 @@ class TransposeConvStridedConverter loc, RankedTensorType::get({6}, rewriter.getI32Type()), rewriter.getI32TensorAttr({0, 1, 3, 2, 4, 5})); - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(convETy), conv2d, transposeConvVal); // Fuse striding behavior back into width / height. llvm::SmallVector convReshapeDims1 = { batch, convHeight * stride[0], convWidth * stride[1], outputChannels}; - conv2d = createOpAndInfer( + conv2d = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(convReshapeDims1)); @@ -348,7 +301,7 @@ class TransposeConvStridedConverter sliceSize[1] = resultSliceHeight; sliceSize[2] = resultSliceWidth; - auto slice = createOpAndInfer( + auto slice = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), conv2d, rewriter.getDenseI64ArrayAttr(sliceBegin), rewriter.getDenseI64ArrayAttr(sliceSize)) @@ -363,10 +316,10 @@ class TransposeConvStridedConverter DenseElementsAttr resultPaddingAttr = DenseIntElementsAttr::get( RankedTensorType::get({4, 2}, rewriter.getI32Type()), resultPadding); - Value resultPaddingVal = createOpAndInfer( + Value resultPaddingVal = CreateOpAndInferShape( rewriter, loc, resultPaddingAttr.getType(), resultPaddingAttr); - Value resultPad = createOpAndInfer( + Value resultPad = CreateOpAndInferShape( rewriter, loc, UnrankedTensorType::get(resultETy), slice, resultPaddingVal); diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp index f276924a8a9f6..1f6e3b2ab8391 100644 --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -102,6 +102,12 @@ computeReshapeOutput(ArrayRef higherRankShape, LogicalResult mlir::tosa::EqualizeRanks(PatternRewriter &rewriter, Location loc, Value &input1, Value &input2) { + ImplicitLocOpBuilder builder(loc, rewriter); + return EqualizeRanks(builder, input1, input2); +} + +LogicalResult mlir::tosa::EqualizeRanks(ImplicitLocOpBuilder &builder, + Value &input1, Value &input2) { auto input1Ty = llvm::dyn_cast(input1.getType()); auto input2Ty = llvm::dyn_cast(input2.getType()); @@ -140,9 +146,9 @@ LogicalResult mlir::tosa::EqualizeRanks(PatternRewriter &rewriter, Location loc, auto reshapeOutputType = RankedTensorType::get( ArrayRef(reshapeOutputShape), reshapeInputType.getElementType()); - auto reshapeLower = rewriter.create( - loc, reshapeOutputType, lowerTensorValue, - rewriter.getDenseI64ArrayAttr(reshapeOutputShape)); + auto reshapeLower = builder.create( + 
reshapeOutputType, lowerTensorValue, + builder.getDenseI64ArrayAttr(reshapeOutputShape)); if (input1Rank > input2Rank) { input1 = higherTensorValue; From 67fb8d15c993f5695cf944b16022a9ee49b9252d Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 16:11:08 -0700 Subject: [PATCH 311/425] [lldb] Convert ProcessDebugger.cpp to new Status API (NFC) --- .../source/Plugins/Process/Windows/Common/ProcessDebugger.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp index 9fc8077851d25..bde72d61b0fee 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp @@ -560,7 +560,7 @@ void ProcessDebugger::OnDebuggerError(const Status &error, uint32_t type) { // If we haven't actually launched the process yet, this was an error // launching the process. Set the internal error and signal the initial // stop event so that the DoLaunch method wakes up and returns a failure. - m_session_data->m_launch_error = error; + m_session_data->m_launch_error = error.Clone(); ::SetEvent(m_session_data->m_initial_stop_event); LLDB_LOG(log, "Error {0} occurred launching the process before the initial " @@ -582,7 +582,7 @@ Status ProcessDebugger::WaitForDebuggerConnection(DebuggerThreadSP debugger, LLDB_LOG(log, "hit loader breakpoint, returning."); process = debugger->GetProcess(); - return m_session_data->m_launch_error; + return m_session_data->m_launch_error.Clone(); } else return Status(::GetLastError(), eErrorTypeWin32); } From e0a93d3505bf6b4c87e819db7a871e0ce4d4100c Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 16:12:11 -0700 Subject: [PATCH 312/425] [lldb] Convert ProcessWindows.cpp to new Status API (NFC) --- lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp | 2 +- lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp index 6f2f6633021d3..9a5fcd88e1282 100644 --- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp @@ -57,7 +57,7 @@ Status DebuggerThread::DebugLaunch(const ProcessLaunchInfo &launch_info) { "lldb.plugin.process-windows.secondary[?]", [this, launch_info] { return DebuggerThreadLaunchRoutine(launch_info); }); if (!secondary_thread) { - result = Status(secondary_thread.takeError()); + result = Status::FromError(secondary_thread.takeError()); LLDB_LOG(log, "couldn't launch debugger thread. {0}", result); } diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 688b363bbb2db..b25068dda53ff 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -821,7 +821,7 @@ void ProcessWindows::OnDebuggerError(const Status &error, uint32_t type) { // If we haven't actually launched the process yet, this was an error // launching the process. Set the internal error and signal the initial // stop event so that the DoLaunch method wakes up and returns a failure. 
- m_session_data->m_launch_error = error; + m_session_data->m_launch_error = error.Clone(); ::SetEvent(m_session_data->m_initial_stop_event); LLDB_LOG( log, From 1be9a80768a03ea9bd2bfbb03762b2bc3c350007 Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Fri, 6 Sep 2024 08:21:47 +0900 Subject: [PATCH 313/425] [mlir][affine] Fix the crash due to the simultaneous replacement store (#90829) `AffineScalarReplacement` should forward the memref store op to load op only if the store op reaches the load. But it now checks the reachability only if these ops are in the same block, which causes the crash reported in https://github.com/llvm/llvm-project/issues/76309. We need to check the reachability even if they are both in the same block, which rescues the case where consecutive store operations are written before the load op. --- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 3 +-- mlir/test/Dialect/Affine/scalrep.mlir | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index f46381403bc52..898467d573362 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -860,8 +860,7 @@ static void forwardStoreToLoad( // 3. The store must reach the load. Access function equivalence only // guarantees this for accesses in the same block. The load could be in a // nested block that is unreachable. - if (storeOp->getBlock() != loadOp->getBlock() && - !mustReachAtInnermost(srcAccess, destAccess)) + if (!mustReachAtInnermost(srcAccess, destAccess)) continue; // 4. Ensure there is no intermediate operation which could replace the diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir index 4a99dee50a280..fdfe3bfb62f95 100644 --- a/mlir/test/Dialect/Affine/scalrep.mlir +++ b/mlir/test/Dialect/Affine/scalrep.mlir @@ -5,6 +5,7 @@ // CHECK-DAG: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d1)> // CHECK-DAG: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 - 1)> // CHECK-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> +// CHECK-DAG: [[$IDENT:#map[0-9]*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @simple_store_load() { func.func @simple_store_load() { @@ -931,3 +932,23 @@ func.func @cross_block() { %69 = affine.load %alloc_99[%c10] : memref<13xi1> return } + +#map1 = affine_map<(d0) -> (d0)> + +// CHECK-LABEL: func @consecutive_store +func.func @consecutive_store() { + // CHECK: %[[CST:.*]] = arith.constant + %tmp = arith.constant 1.1 : f16 + // CHECK: %[[ALLOC:.*]] = memref.alloc + %alloc_66 = memref.alloc() : memref + affine.for %arg2 = 4 to 6 { + affine.for %arg3 = #map1(%arg2) to #map1(%arg2) step 4 { + // CHECK: affine.store %[[CST]], %[[ALLOC]][] + affine.store %tmp, %alloc_66[] : memref + // CHECK-NOT: affine.store %[[CST]], %[[ALLOC]][] + affine.store %tmp, %alloc_66[] : memref + %270 = affine.load %alloc_66[] : memref + } + } + return +} From d1756165a9066f907b88d51dd8e3ffee15a8cc1e Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:36:21 -0700 Subject: [PATCH 314/425] [lld-macho][arm64] Enhance safe ICF with thunk-based deduplication (#106573) Currently, our `safe` ICF mode only merges non-address-significant code, leaving duplicate address-significant functions in the output. 
This patch introduces `safe_thunks` ICF mode, which keeps a single master copy of each function and replaces address-significant duplicates with thunks that branch to the master copy. Currently `--icf=safe_thunks` is only supported for `arm64` architectures. **Perf stats for a large binary:** | ICF Option | Total Size | __text Size | __unwind_info | % total | |-------------------|------------|-------------|---------------------|---------------------------| | `--icf=none` | 91.738 MB | 55.220 MB | 1.424 MB | 0% | | `--icf=safe` | 85.042 MB | 49.572 MB | 1.168 MB | 7.30% | | `--icf=safe_thunks` | 84.650 MB | 49.219 MB | 1.143 MB | 7.72% | | `--icf=all` | 82.060 MB | 48.726 MB | 1.111 MB | 10.55% | So overall we can expect a `~0.45%` binary size reduction for a typical large binary compared to the `--icf=safe` option. **Runtime:** Linking the above binary took ~10 seconds. Comparing the link performance of --icf=safe_thunks vs --icf=safe, a ~2% slowdown was observed. --- lld/MachO/Arch/ARM64.cpp | 23 +++ lld/MachO/Config.h | 1 + lld/MachO/Driver.cpp | 9 +- lld/MachO/ICF.cpp | 92 ++++++++++- lld/MachO/InputSection.cpp | 5 +- lld/MachO/InputSection.h | 3 +- lld/MachO/MapFile.cpp | 2 +- lld/MachO/Symbols.cpp | 2 +- lld/MachO/Symbols.h | 13 +- lld/MachO/SyntheticSections.cpp | 3 +- lld/MachO/Target.h | 10 ++ lld/test/MachO/icf-safe-thunks.ll | 254 ++++++++++++++++++++++++++++++ 12 files changed, 405 insertions(+), 12 deletions(-) create mode 100644 lld/test/MachO/icf-safe-thunks.ll diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp index e192676394c96..195a8f09f47c1 100644 --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -41,6 +41,10 @@ struct ARM64 : ARM64Common { Symbol *objcMsgSend) const override; void populateThunk(InputSection *thunk, Symbol *funcSym) override; void applyOptimizationHints(uint8_t *, const ObjFile &) const override; + + void initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const override; + uint32_t getICFSafeThunkSize() const override; }; } // namespace @@ -175,6 +179,25 @@ void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) { /*offset=*/0, /*addend=*/0, /*referent=*/funcSym); } +// Just a single direct branch to the target function. +static constexpr uint32_t icfSafeThunkCode[] = { + 0x14000000, // 08: b target +}; + +void ARM64::initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const { + // The base data here will not be itself modified, we'll just be adding a + // reloc below. So we can directly use the constexpr above as the data. 
+ thunk->data = {reinterpret_cast(icfSafeThunkCode), + sizeof(icfSafeThunkCode)}; + + thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_BRANCH26, + /*pcrel=*/true, /*length=*/2, + /*offset=*/0, /*addend=*/0, + /*referent=*/branchTarget); +} + +uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); } ARM64::ARM64() : ARM64Common(LP64()) { cpuType = CPU_TYPE_ARM64; diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 5beb0662ba727..4e940693602c9 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -68,6 +68,7 @@ enum class ICFLevel { unknown, none, safe, + safe_thunks, all, }; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 09a539d71dab3..fa5a5dab8b565 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -847,8 +847,14 @@ static ICFLevel getICFLevel(const ArgList &args) { auto icfLevel = StringSwitch(icfLevelStr) .Cases("none", "", ICFLevel::none) .Case("safe", ICFLevel::safe) + .Case("safe_thunks", ICFLevel::safe_thunks) .Case("all", ICFLevel::all) .Default(ICFLevel::unknown); + + if ((icfLevel == ICFLevel::safe_thunks) && (config->arch() != AK_arm64)) { + error("--icf=safe_thunks is only supported on arm64 targets"); + } + if (icfLevel == ICFLevel::unknown) { warn(Twine("unknown --icf=OPTION `") + icfLevelStr + "', defaulting to `none'"); @@ -2116,7 +2122,8 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, // foldIdenticalLiterals before foldIdenticalSections. foldIdenticalLiterals(); if (config->icfLevel != ICFLevel::none) { - if (config->icfLevel == ICFLevel::safe) + if (config->icfLevel == ICFLevel::safe || + config->icfLevel == ICFLevel::safe_thunks) markAddrSigSymbols(); foldIdenticalSections(/*onlyCfStrings=*/false); } else if (config->dedupStrings) { diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index fc786b571dc64..2ff962b06e367 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -45,6 +45,7 @@ class ICF { const ConcatInputSection *ib); bool equalsVariable(const ConcatInputSection *ia, const ConcatInputSection *ib); + void applySafeThunksToRange(size_t begin, size_t end); // ICF needs a copy of the inputs vector because its equivalence-class // segregation algorithm destroys the proper sequence. @@ -251,6 +252,50 @@ void ICF::forEachClassRange(size_t begin, size_t end, } } +// Given a range of identical icfInputs, replace address significant functions +// with a thunk that is just a direct branch to the first function in the +// series. This way we keep only one main body of the function but we still +// retain the address uniqueness of relevant functions by having them be a +// direct branch thunk rather than containing a full copy of the actual function +// body. +void ICF::applySafeThunksToRange(size_t begin, size_t end) { + // If the functions we're dealing with are smaller than the thunk size, then + // just leave them all as-is - creating thunks would be a net loss. + uint32_t thunkSize = target->getICFSafeThunkSize(); + if (icfInputs[begin]->data.size() <= thunkSize) + return; + + // When creating a unique ICF thunk, use the first section as the section that + // all thunks will branch to. + ConcatInputSection *masterIsec = icfInputs[begin]; + + for (size_t i = begin + 1; i < end; ++i) { + ConcatInputSection *isec = icfInputs[i]; + // When we're done processing keepUnique entries, we can stop. Sorting + // guaratees that all keepUnique will be at the front. 
+ if (!isec->keepUnique) + break; + + ConcatInputSection *thunk = + makeSyntheticInputSection(isec->getSegName(), isec->getName()); + addInputSection(thunk); + + target->initICFSafeThunkBody(thunk, masterIsec); + thunk->foldIdentical(isec, Symbol::ICFFoldKind::Thunk); + + // Since we're folding the target function into a thunk, we need to adjust + // the symbols that now got relocated from the target function to the thunk. + // Since the thunk is only one branch, we move all symbols to offset 0 and + // make sure that the size of all non-zero-size symbols is equal to the size + // of the branch. + for (auto *sym : thunk->symbols) { + sym->value = 0; + if (sym->size != 0) + sym->size = thunkSize; + } + } +} + // Split icfInputs into shards, then parallelize invocation of FUNC on subranges // with matching equivalence class void ICF::forEachClass(llvm::function_ref func) { @@ -312,6 +357,12 @@ void ICF::run() { llvm::stable_sort( icfInputs, [](const ConcatInputSection *a, const ConcatInputSection *b) { + // When using safe_thunks, ensure that we first sort by icfEqClass and + // then by keepUnique (descending). This guarantees that within an + // equivalence class, the keepUnique inputs are always first. + if (config->icfLevel == ICFLevel::safe_thunks) + if (a->icfEqClass[0] == b->icfEqClass[0]) + return a->keepUnique > b->keepUnique; return a->icfEqClass[0] < b->icfEqClass[0]; }); forEachClass([&](size_t begin, size_t end) { @@ -331,13 +382,37 @@ void ICF::run() { log("equalsVariable() called " + Twine(equalsVariableCount) + " times"); } + // When using safe_thunks, we need to create thunks for all keepUnique + // functions that can be deduplicated. Since we're creating / adding new + // InputSections, we can't paralellize this. + if (config->icfLevel == ICFLevel::safe_thunks) + forEachClassRange(0, icfInputs.size(), [&](size_t begin, size_t end) { + applySafeThunksToRange(begin, end); + }); + // Fold sections within equivalence classes forEachClass([&](size_t begin, size_t end) { if (end - begin < 2) return; + bool useSafeThunks = config->icfLevel == ICFLevel::safe_thunks; + + // For ICF level safe_thunks, replace keepUnique function bodies with + // thunks. For all other ICF levles, directly merge the functions. + ConcatInputSection *beginIsec = icfInputs[begin]; - for (size_t i = begin + 1; i < end; ++i) + for (size_t i = begin + 1; i < end; ++i) { + // Skip keepUnique inputs when using safe_thunks (already handeled above) + if (useSafeThunks && icfInputs[i]->keepUnique) { + // Assert keepUnique sections are either small or replaced with thunks. + assert(!icfInputs[i]->live || + icfInputs[i]->data.size() <= target->getICFSafeThunkSize()); + assert(!icfInputs[i]->replacement || + icfInputs[i]->replacement->data.size() == + target->getICFSafeThunkSize()); + continue; + } beginIsec->foldIdentical(icfInputs[i]); + } }); } @@ -421,11 +496,22 @@ void macho::foldIdenticalSections(bool onlyCfStrings) { // can still fold it. bool hasFoldableFlags = (isSelRefsSection(isec) || sectionType(isec->getFlags()) == MachO::S_REGULAR); + + bool isCodeSec = isCodeSection(isec); + + // When keepUnique is true, the section is not foldable. Unless we are at + // icf level safe_thunks, in which case we still want to fold code sections. + // When using safe_thunks we'll apply the safe_thunks logic at merge time + // based on the 'keepUnique' flag. 
+ bool noUniqueRequirement = + !isec->keepUnique || + ((config->icfLevel == ICFLevel::safe_thunks) && isCodeSec); + // FIXME: consider non-code __text sections as foldable? bool isFoldable = (!onlyCfStrings || isCfStringSection(isec)) && - (isCodeSection(isec) || isFoldableWithAddendsRemoved || + (isCodeSec || isFoldableWithAddendsRemoved || isGccExceptTabSection(isec)) && - !isec->keepUnique && !isec->hasAltEntry && + noUniqueRequirement && !isec->hasAltEntry && !isec->shouldOmitFromOutput() && hasFoldableFlags; if (isFoldable) { foldable.push_back(isec); diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index a9b93e07a6013..64c584920defb 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -190,13 +190,14 @@ const Reloc *InputSection::getRelocAt(uint32_t off) const { return &*it; } -void ConcatInputSection::foldIdentical(ConcatInputSection *copy) { +void ConcatInputSection::foldIdentical(ConcatInputSection *copy, + Symbol::ICFFoldKind foldKind) { align = std::max(align, copy->align); copy->live = false; copy->wasCoalesced = true; copy->replacement = this; for (auto ©Sym : copy->symbols) - copySym->wasIdenticalCodeFolded = true; + copySym->identicalCodeFoldingKind = foldKind; symbols.insert(symbols.end(), copy->symbols.begin(), copy->symbols.end()); copy->symbols.clear(); diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h index 0f389e50425a3..4e238d8ef7779 100644 --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -117,7 +117,8 @@ class ConcatInputSection final : public InputSection { bool shouldOmitFromOutput() const { return !live || isCoalescedWeak(); } void writeTo(uint8_t *buf); - void foldIdentical(ConcatInputSection *redundant); + void foldIdentical(ConcatInputSection *redundant, + Symbol::ICFFoldKind foldKind = Symbol::ICFFoldKind::Body); ConcatInputSection *canonical() override { return replacement ? replacement : this; } diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp index 5bcaeca48da2a..9c0621622ae2f 100644 --- a/lld/MachO/MapFile.cpp +++ b/lld/MachO/MapFile.cpp @@ -156,7 +156,7 @@ static void printNonLazyPointerSection(raw_fd_ostream &os, } static uint64_t getSymSizeForMap(Defined *sym) { - if (sym->wasIdenticalCodeFolded) + if (sym->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body) return 0; return sym->size; } diff --git a/lld/MachO/Symbols.cpp b/lld/MachO/Symbols.cpp index f52da4f48aafb..9faf01e09de05 100644 --- a/lld/MachO/Symbols.cpp +++ b/lld/MachO/Symbols.cpp @@ -60,7 +60,7 @@ Defined::Defined(StringRef name, InputFile *file, InputSection *isec, bool interposable) : Symbol(DefinedKind, name, file), overridesWeakDef(canOverrideWeakDef), privateExtern(isPrivateExtern), includeInSymtab(includeInSymtab), - wasIdenticalCodeFolded(false), + identicalCodeFoldingKind(ICFFoldKind::None), referencedDynamically(isReferencedDynamically), noDeadStrip(noDeadStrip), interposable(interposable), weakDefCanBeHidden(isWeakDefCanBeHidden), weakDef(isWeakDef), external(isExternal), originalIsec(isec), diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index bd60226f38ad3..70fd195f25c92 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -33,6 +33,15 @@ class Symbol { AliasKind, }; + // Enum that describes the type of Identical Code Folding (ICF) applied to a + // symbol. This information is crucial for accurately representing symbol + // sizes in the map file. + enum ICFFoldKind { + None, // No folding is applied. + Body, // The entire body (function or data) is folded. 
+ Thunk // The function body is folded into a single branch thunk. + }; + virtual ~Symbol() {} Kind kind() const { return symbolKind; } @@ -142,8 +151,8 @@ class Defined : public Symbol { bool privateExtern : 1; // Whether this symbol should appear in the output symbol table. bool includeInSymtab : 1; - // Whether this symbol was folded into a different symbol during ICF. - bool wasIdenticalCodeFolded : 1; + // The ICF folding kind of this symbol: None / Body / Thunk. + ICFFoldKind identicalCodeFoldingKind : 2; // Symbols marked referencedDynamically won't be removed from the output's // symbol table by tools like strip. In theory, this could be set on arbitrary // symbols in input object files. In practice, it's used solely for the diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 939e9b286d77f..7b0078856c5e2 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1231,7 +1231,8 @@ void SymtabSection::emitStabs() { // Constant-folded symbols go in the executable's symbol table, but don't // get a stabs entry unless --keep-icf-stabs flag is specified - if (!config->keepICFStabs && defined->wasIdenticalCodeFolded) + if (!config->keepICFStabs && + defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body) continue; ObjFile *file = defined->getObjectFile(); diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index cc47ae4386b47..eaa0336e70cb6 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -74,6 +74,16 @@ class TargetInfo { uint64_t selrefVA, Symbol *objcMsgSend) const = 0; + // Init 'thunk' so that it be a direct jump to 'branchTarget'. + virtual void initICFSafeThunkBody(InputSection *thunk, + InputSection *branchTarget) const { + llvm_unreachable("target does not support ICF safe thunks"); + } + + virtual uint32_t getICFSafeThunkSize() const { + llvm_unreachable("target does not support ICF safe thunks"); + } + // Symbols may be referenced via either the GOT or the stubs section, // depending on the relocation type. 
prepareSymbolRelocation() will set up the // GOT/stubs entries, and resolveSymbolVA() will return the addresses of those diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll new file mode 100644 index 0000000000000..238e90f952e16 --- /dev/null +++ b/lld/test/MachO/icf-safe-thunks.ll @@ -0,0 +1,254 @@ +; REQUIRES: aarch64 + +; RUN: rm -rf %t; mkdir %t +; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj-safe-thunks.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig +; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/icf-safe.dylib -map %t/icf-safe.map %t/icf-obj-safe-thunks.o +; RUN: llvm-objdump %t/icf-safe.dylib -d --macho | FileCheck %s --check-prefixes=CHECK-ARM64 +; RUN: cat %t/icf-safe.map | FileCheck %s --check-prefixes=CHECK-ARM64-MAP + +; CHECK-ARM64: (__TEXT,__text) section +; CHECK-ARM64-NEXT: _func_unique_1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x1 +; +; CHECK-ARM64: _func_unique_2_canmerge: +; CHECK-ARM64-NEXT: _func_2identical_v1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x2 +; +; CHECK-ARM64: _func_3identical_v1: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x3 +; +; CHECK-ARM64: _func_3identical_v1_canmerge: +; CHECK-ARM64-NEXT: _func_3identical_v2_canmerge: +; CHECK-ARM64-NEXT: _func_3identical_v3_canmerge: +; CHECK-ARM64-NEXT: mov {{.*}}, #0x21 +; +; CHECK-ARM64: _call_all_funcs: +; CHECK-ARM64-NEXT: stp x29 +; +; CHECK-ARM64: _take_func_addr: +; CHECK-ARM64-NEXT: adr +; +; CHECK-ARM64: _func_2identical_v2: +; CHECK-ARM64-NEXT: b _func_2identical_v1 +; CHECK-ARM64-NEXT: _func_3identical_v2: +; CHECK-ARM64-NEXT: b _func_3identical_v1 +; CHECK-ARM64-NEXT: _func_3identical_v3: +; CHECK-ARM64-NEXT: b _func_3identical_v1 + + +; CHECK-ARM64-MAP: 0x00000010 [ 2] _func_unique_1 +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_2identical_v1 +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_unique_2_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1 +; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v2_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v3_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000034 [ 2] _call_all_funcs +; CHECK-ARM64-MAP-NEXT: 0x00000050 [ 2] _take_func_addr +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_2identical_v2 +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_3identical_v2 +; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_3identical_v3 + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx11.0.0" + +@g_val = global i8 0, align 1 +@g_ptr = global ptr null, align 8 + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_unique_1() #0 { +entry: + store volatile i8 1, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_unique_2_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_2identical_v1() #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_2identical_v2() #0 { +entry: + store volatile i8 2, ptr @g_val, align 1, 
!tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v1() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v2() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v3() #0 { +entry: + store volatile i8 3, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v1_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v2_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_3identical_v3_canmerge() local_unnamed_addr #0 { +entry: + store volatile i8 33, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) +define void @call_all_funcs() local_unnamed_addr #1 { +entry: + tail call void @func_unique_1() + tail call void @func_unique_2_canmerge() + tail call void @func_2identical_v1() + tail call void @func_2identical_v2() + tail call void @func_3identical_v1() + tail call void @func_3identical_v2() + tail call void @func_3identical_v3() + tail call void @func_3identical_v1_canmerge() + tail call void @func_3identical_v2_canmerge() + tail call void @func_3identical_v3_canmerge() + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @take_func_addr() local_unnamed_addr #0 { +entry: + store volatile ptr @func_unique_1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_2identical_v1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_2identical_v2, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v1, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v2, ptr @g_ptr, align 8, !tbaa !8 + store volatile ptr @func_3identical_v3, ptr @g_ptr, align 8, !tbaa !8 + ret void +} + +attributes #0 = { mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" } +attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" 
"target-features"="+aes,+altnzcv,+ccdp,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 1} +!3 = !{i32 7, !"frame-pointer", i32 1} +!4 = !{!"clang"} +!5 = !{!6, !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"any pointer", !6, i64 0} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;; Generate the above LLVM IR with the below script ;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; #!/bin/bash +; set -ex +; TOOLCHAIN_BIN="llvm-project/build/Debug/bin" +; +; # Create icf-safe-thunks.cpp file +; cat > icf-safe-thunks.cpp < Date: Thu, 5 Sep 2024 23:41:18 +0000 Subject: [PATCH 315/425] [SandboxIR] Add bazel support (#107486) --- .../bazel/llvm-project-overlay/llvm/BUILD.bazel | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b2dcc696b0ad0..b574397e108b7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -1446,6 +1446,20 @@ cc_library( ], ) +cc_library( + name = "SandboxIR", + srcs = glob([ + "lib/SandboxIR/*.cpp", + ]), + hdrs = glob(["include/llvm/SandboxIR/*.h"]), + textual_hdrs = ["include/llvm/SandboxIR/SandboxIRValues.def"], + copts = llvm_copts, + deps = [ + ":Core", + ":Support", + ], +) + cc_library( name = "Scalar", srcs = glob([ @@ -1474,14 +1488,17 @@ cc_library( srcs = glob([ "lib/Transforms/Vectorize/*.cpp", "lib/Transforms/Vectorize/*.h", + "lib/Transforms/Vectorize/SandboxVectorizer/*.cpp", ]), hdrs = glob([ "include/llvm/Transforms/Vectorize/*.h", + "include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" ]), copts = llvm_copts, deps = [ ":Analysis", ":Core", + ":SandboxIR", ":Scalar", ":Support", ":Target", From d7dd2c468fecae871ba67e891a3519c758c94b63 Mon Sep 17 00:00:00 2001 From: ziqingluo-90 Date: Thu, 5 Sep 2024 16:18:50 -0700 Subject: [PATCH 316/425] Re-land "[-Wunsafe-buffer-usage] Warning Libc functions (#101583)" Revert commit 23457964392d00fc872fa6021763859024fb38da, and re-land with a new flag "-Wunsafe-buffer-usage-in-libc-call" for the new warning. 
(rdar://117182250) --- .../Analysis/Analyses/UnsafeBufferUsage.h | 19 + .../Analyses/UnsafeBufferUsageGadgets.def | 13 +- clang/include/clang/Basic/DiagnosticGroups.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td | 7 + clang/lib/Analysis/UnsafeBufferUsage.cpp | 529 +++++++++++++++++- clang/lib/Sema/AnalysisBasedWarnings.cpp | 23 +- ...-usage-libc-functions-inline-namespace.cpp | 60 ++ ...arn-unsafe-buffer-usage-libc-functions.cpp | 124 ++++ ...n-unsafe-buffer-usage-test-unreachable.cpp | 4 +- 9 files changed, 766 insertions(+), 16 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h index 228b4ae1e3e11..267cde64f8f23 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H #include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" #include "clang/AST/Stmt.h" #include "clang/Basic/SourceLocation.h" #include "llvm/Support/Debug.h" @@ -106,6 +107,20 @@ class UnsafeBufferUsageHandler { virtual void handleUnsafeOperation(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) = 0; + /// Invoked when a call to an unsafe libc function is found. + /// \param PrintfInfo + /// is 0 if the callee function is not a member of the printf family; + /// is 1 if the callee is `sprintf`; + /// is 2 if arguments of the call have `__size_by` relation but are not in a + /// safe pattern; + /// is 3 if string arguments do not guarantee null-termination + /// is 4 if the callee takes va_list + /// \param UnsafeArg one of the actual arguments that is unsafe, non-null + /// only when `2 <= PrintfInfo <= 3` + virtual void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) = 0; + /// Invoked when an unsafe operation with a std container is found. virtual void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, @@ -151,6 +166,10 @@ class UnsafeBufferUsageHandler { virtual bool ignoreUnsafeBufferInContainer(const SourceLocation &Loc) const = 0; + /// \return true iff unsafe libc call should NOT be reported at `Loc` + virtual bool + ignoreUnsafeBufferInLibcCall(const SourceLocation &Loc) const = 0; + virtual std::string getUnsafeBufferUsageAttributeTextAt(SourceLocation Loc, StringRef WSSuffix = "") const = 0; diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def index 242ad763ba62b..09fa510bc0472 100644 --- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def +++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def @@ -18,10 +18,10 @@ #define WARNING_GADGET(name) GADGET(name) #endif -/// A `WARNING_GADGET` subset, where the code pattern of each gadget -/// corresponds uses of a (possibly hardened) contatiner (e.g., `std::span`). 
-#ifndef WARNING_CONTAINER_GADGET -#define WARNING_CONTAINER_GADGET(name) WARNING_GADGET(name) +/// A `WARNING_GADGET` subset, each of which may be enable/disable separately +/// with different flags +#ifndef WARNING_OPTIONAL_GADGET +#define WARNING_OPTIONAL_GADGET(name) WARNING_GADGET(name) #endif /// Safe gadgets correspond to code patterns that aren't unsafe but need to be @@ -38,7 +38,8 @@ WARNING_GADGET(PointerArithmetic) WARNING_GADGET(UnsafeBufferUsageAttr) WARNING_GADGET(UnsafeBufferUsageCtorAttr) WARNING_GADGET(DataInvocation) -WARNING_CONTAINER_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` +WARNING_OPTIONAL_GADGET(UnsafeLibcFunctionCall) +WARNING_OPTIONAL_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)` FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context FIXABLE_GADGET(DerefSimplePtrArithFixable) FIXABLE_GADGET(PointerDereference) @@ -52,5 +53,5 @@ FIXABLE_GADGET(PointerInit) #undef FIXABLE_GADGET #undef WARNING_GADGET -#undef WARNING_CONTAINER_GADGET +#undef WARNING_OPTIONAL_GADGET #undef GADGET diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index c4c29942ee1cb..116ce7a04f66f 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1558,7 +1558,8 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">; // Warnings and fixes to support the "safe buffers" programming model. def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">; -def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer]>; +def UnsafeBufferUsageInLibcCall : DiagGroup<"unsafe-buffer-usage-in-libc-call">; +def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer, UnsafeBufferUsageInLibcCall]>; // Warnings and notes related to the function effects system underlying // the nonblocking and nonallocating attributes. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 72ea5338ce615..083684670a980 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12416,6 +12416,13 @@ def warn_unsafe_buffer_operation : Warning< "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; +def warn_unsafe_buffer_libc_call : Warning< + "function %0 is unsafe">, + InGroup, DefaultIgnore; +def note_unsafe_buffer_printf_call : Note< + "%select{|change to 'snprintf' for explicit bounds checking | buffer pointer and size may not match" + "|string argument is not guaranteed to be null-terminated" + "|'va_list' is unsafe}0">; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; def note_unsafe_buffer_variable_fixit_group : Note< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index da7446913f7c8..ec76eae8b6077 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -10,12 +10,12 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/FormatString.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" -#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" @@ -247,6 +247,11 @@ AST_MATCHER_P(Stmt, ignoreUnsafeBufferInContainer, return Handler->ignoreUnsafeBufferInContainer(Node.getBeginLoc()); } +AST_MATCHER_P(Stmt, ignoreUnsafeLibcCall, const UnsafeBufferUsageHandler *, + Handler) { + return Handler->ignoreUnsafeBufferInLibcCall(Node.getBeginLoc()); +} + AST_MATCHER_P(CastExpr, castSubExpr, internal::Matcher, innerMatcher) { return innerMatcher.matches(*Node.getSubExpr(), Finder, Builder); } @@ -443,6 +448,425 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } +AST_MATCHER_P(CallExpr, hasNumArgs, unsigned, Num) { + return Node.getNumArgs() == Num; +} + +namespace libc_func_matchers { +// Under `libc_func_matchers`, define a set of matchers that match unsafe +// functions in libc and unsafe calls to them. + +// A tiny parser to strip off common prefix and suffix of libc function names +// in real code. +// +// Given a function name, `matchName` returns `CoreName` according to the +// following grammar: +// +// LibcName := CoreName | CoreName + "_s" +// MatchingName := "__builtin_" + LibcName | +// "__builtin___" + LibcName + "_chk" | +// "__asan_" + LibcName +// +struct LibcFunNamePrefixSuffixParser { + StringRef matchName(StringRef FunName, bool isBuiltin) { + // Try to match __builtin_: + if (isBuiltin && FunName.starts_with("__builtin_")) + // Then either it is __builtin_LibcName or __builtin___LibcName_chk or + // no match: + return matchLibcNameOrBuiltinChk( + FunName.drop_front(10 /* truncate "__builtin_" */)); + // Try to match __asan_: + if (FunName.starts_with("__asan_")) + return matchLibcName(FunName.drop_front(7 /* truncate of "__asan_" */)); + return matchLibcName(FunName); + } + + // Parameter `Name` is the substring after stripping off the prefix + // "__builtin_". 
+ StringRef matchLibcNameOrBuiltinChk(StringRef Name) { + if (Name.starts_with("__") && Name.ends_with("_chk")) + return matchLibcName( + Name.drop_front(2).drop_back(4) /* truncate "__" and "_chk" */); + return matchLibcName(Name); + } + + StringRef matchLibcName(StringRef Name) { + if (Name.ends_with("_s")) + return Name.drop_back(2 /* truncate "_s" */); + return Name; + } +}; + +// A pointer type expression is known to be null-terminated, if it has the +// form: E.c_str(), for any expression E of `std::string` type. +static bool isNullTermPointer(const Expr *Ptr) { + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (isa(Ptr->IgnoreParenImpCasts())) + return true; + if (auto *MCE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + const CXXMethodDecl *MD = MCE->getMethodDecl(); + const CXXRecordDecl *RD = MCE->getRecordDecl()->getCanonicalDecl(); + + if (MD && RD && RD->isInStdNamespace()) + if (MD->getName() == "c_str" && RD->getName() == "basic_string") + return true; + } + return false; +} + +// Return true iff at least one of following cases holds: +// 1. Format string is a literal and there is an unsafe pointer argument +// corresponding to an `s` specifier; +// 2. Format string is not a literal and there is least an unsafe pointer +// argument (including the formatter argument). +// +// `UnsafeArg` is the output argument that will be set only if this function +// returns true. +static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, + const unsigned FmtArgIdx, ASTContext &Ctx, + bool isKprintf = false) { + class StringFormatStringHandler + : public analyze_format_string::FormatStringHandler { + const CallExpr *Call; + unsigned FmtArgIdx; + const Expr *&UnsafeArg; + + public: + StringFormatStringHandler(const CallExpr *Call, unsigned FmtArgIdx, + const Expr *&UnsafeArg) + : Call(Call), FmtArgIdx(FmtArgIdx), UnsafeArg(UnsafeArg) {} + + bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS, + const char *startSpecifier, + unsigned specifierLen, + const TargetInfo &Target) override { + if (FS.getConversionSpecifier().getKind() == + analyze_printf::PrintfConversionSpecifier::sArg) { + unsigned ArgIdx = FS.getPositionalArgIndex() + FmtArgIdx; + + if (0 < ArgIdx && ArgIdx < Call->getNumArgs()) + if (!isNullTermPointer(Call->getArg(ArgIdx))) { + UnsafeArg = Call->getArg(ArgIdx); // output + // returning false stops parsing immediately + return false; + } + } + return true; // continue parsing + } + }; + + const Expr *Fmt = Call->getArg(FmtArgIdx); + + if (auto *SL = dyn_cast(Fmt->IgnoreParenImpCasts())) { + StringRef FmtStr = SL->getString(); + StringFormatStringHandler Handler(Call, FmtArgIdx, UnsafeArg); + + return analyze_format_string::ParsePrintfString( + Handler, FmtStr.begin(), FmtStr.end(), Ctx.getLangOpts(), + Ctx.getTargetInfo(), isKprintf); + } + // If format is not a string literal, we cannot analyze the format string. + // In this case, this call is considered unsafe if at least one argument + // (including the format argument) is unsafe pointer. + return llvm::any_of( + llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()), + [&UnsafeArg](const Expr *Arg) -> bool { + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) { + UnsafeArg = Arg; + return true; + } + return false; + }); +} + +// Matches a FunctionDecl node such that +// 1. It's name, after stripping off predefined prefix and suffix, is +// `CoreName`; and +// 2. 
`CoreName` or `CoreName[str/wcs]` is one of the `PredefinedNames`, which +// is a set of libc function names. +// +// Note: For predefined prefix and suffix, see `LibcFunNamePrefixSuffixParser`. +// The notation `CoreName[str/wcs]` means a new name obtained from replace +// string "wcs" with "str" in `CoreName`. +AST_MATCHER(FunctionDecl, isPredefinedUnsafeLibcFunc) { + static std::unique_ptr> PredefinedNames = nullptr; + if (!PredefinedNames) + PredefinedNames = + std::make_unique, std::set>({ + // numeric conversion: + "atof", + "atoi", + "atol", + "atoll", + "strtol", + "strtoll", + "strtoul", + "strtoull", + "strtof", + "strtod", + "strtold", + "strtoimax", + "strtoumax", + // "strfromf", "strfromd", "strfroml", // C23? + // string manipulation: + "strcpy", + "strncpy", + "strlcpy", + "strcat", + "strncat", + "strlcat", + "strxfrm", + "strdup", + "strndup", + // string examination: + "strlen", + "strnlen", + "strcmp", + "strncmp", + "stricmp", + "strcasecmp", + "strcoll", + "strchr", + "strrchr", + "strspn", + "strcspn", + "strpbrk", + "strstr", + "strtok", + // "mem-" functions + "memchr", + "wmemchr", + "memcmp", + "wmemcmp", + "memcpy", + "memccpy", + "mempcpy", + "wmemcpy", + "memmove", + "wmemmove", + "memset", + "wmemset", + // IO: + "fread", + "fwrite", + "fgets", + "fgetws", + "gets", + "fputs", + "fputws", + "puts", + // others + "strerror_s", + "strerror_r", + "bcopy", + "bzero", + "bsearch", + "qsort", + }); + + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + // Match predefined names: + if (PredefinedNames->find(Name) != PredefinedNames->end()) + return true; + + std::string NameWCS = Name.str(); + size_t WcsPos = NameWCS.find("wcs"); + + while (WcsPos != std::string::npos) { + NameWCS[WcsPos++] = 's'; + NameWCS[WcsPos++] = 't'; + NameWCS[WcsPos++] = 'r'; + WcsPos = NameWCS.find("wcs", WcsPos); + } + if (PredefinedNames->find(NameWCS) != PredefinedNames->end()) + return true; + // All `scanf` functions are unsafe (including `sscanf`, `vsscanf`, etc.. They + // all should end with "scanf"): + return Name.ends_with("scanf"); +} + +// Match a call to one of the `v*printf` functions taking `va_list`. We cannot +// check safety for these functions so they should be changed to their +// non-va_list versions. +AST_MATCHER(FunctionDecl, isUnsafeVaListPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf")) + return false; // neither printf nor scanf + return Name.starts_with("v"); +} + +// Matches a call to one of the `sprintf` functions as they are always unsafe +// and should be changed to `snprintf`. +AST_MATCHER(FunctionDecl, isUnsafeSprintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || + // Let `isUnsafeVaListPrintfFunc` check for cases with va-list: + Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + return Prefix == "s"; +} + +// Match function declarations of `printf`, `fprintf`, `snprintf` and their wide +// character versions. Calls to these functions can be safe if their arguments +// are carefully made safe. 
+AST_MATCHER(FunctionDecl, isNormalPrintfFunc) { + auto *II = Node.getIdentifier(); + + if (!II) + return false; + + StringRef Name = LibcFunNamePrefixSuffixParser().matchName( + II->getName(), Node.getBuiltinID()); + + if (!Name.ends_with("printf") || Name.starts_with("v")) + return false; + + StringRef Prefix = Name.drop_back(6); + + if (Prefix.ends_with("w")) + Prefix = Prefix.drop_back(1); + + return Prefix.empty() || Prefix == "k" || Prefix == "f" || Prefix == "sn"; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then if the format string is a string literal, this matcher matches when at +// least one string argument is unsafe. If the format is not a string literal, +// this matcher matches when at least one pointer type argument is unsafe. +AST_MATCHER_P(CallExpr, hasUnsafePrintfStringArg, + clang::ast_matchers::internal::Matcher, + UnsafeStringArgMatcher) { + // Determine what printf it is: + const Expr *FirstArg = Node.getArg(0); + ASTContext &Ctx = Finder->getASTContext(); + + if (isa(FirstArg->IgnoreParenImpCasts())) { + // It is a printf/kprintf. And, the format is a string literal: + bool isKprintf = false; + const Expr *UnsafeArg; + + if (auto *Callee = Node.getDirectCallee()) + if (auto *II = Callee->getIdentifier()) + isKprintf = II->getName() == "kprintf"; + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 0, Ctx, isKprintf)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + QualType PtrTy = FirstArg->getType(); + + assert(PtrTy->isPointerType()); + + QualType PteTy = (cast(PtrTy))->getPointeeType(); + + if (!Ctx.getFILEType().isNull() /* If `FILE *` is not ever in the ASTContext, + there can't be any file pointer then */ + && PteTy.getCanonicalType() == Ctx.getFILEType().getCanonicalType()) { + // It is a fprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 1, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + + const Expr *SecondArg = Node.getArg(1); + + if (SecondArg->getType()->isIntegerType()) { + // It is a snprintf: + const Expr *UnsafeArg; + + if (hasUnsafeFormatOrSArg(&Node, UnsafeArg, 2, Ctx, false)) + return UnsafeStringArgMatcher.matches(*UnsafeArg, Finder, Builder); + return false; + } + // It is printf but the format string is passed by pointer. The only thing we + // can do is to require all pointers to be null-terminated: + for (auto Arg : Node.arguments()) + if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) + if (UnsafeStringArgMatcher.matches(*Arg, Finder, Builder)) + return true; + return false; +} + +// This matcher requires that it is known that the callee `isNormalPrintf`. +// Then it matches if the first two arguments of the call is a pointer and an +// integer and they are not in a safe pattern. +// +// For the first two arguments: `ptr` and `size`, they are safe if in the +// following patterns: +// ptr := DRE.data(); +// size:= DRE.size()/DRE.size_bytes() +// And DRE is a hardened container or view. 
+AST_MATCHER(CallExpr, hasUnsafeSnprintfBuffer) { + if (Node.getNumArgs() < 3) + return false; // not an snprintf call + + const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); + + if (!Buf->getType()->isPointerType() || !Size->getType()->isIntegerType()) + return false; // not an snprintf call + + static StringRef SizedObjs[] = {"span", "array", "vector", + "basic_string_view", "basic_string"}; + Buf = Buf->IgnoreParenImpCasts(); + Size = Size->IgnoreParenImpCasts(); + if (auto *MCEPtr = dyn_cast(Buf)) + if (auto *MCESize = dyn_cast(Size)) { + auto *DREOfPtr = dyn_cast( + MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); + auto *DREOfSize = dyn_cast( + MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); + + if (!DREOfPtr || !DREOfSize) + return true; // not in safe pattern + if (DREOfPtr->getDecl() != DREOfSize->getDecl()) + return true; // not in safe pattern + if (MCEPtr->getMethodDecl()->getName() != "data") + return true; // not in safe pattern + + if (MCESize->getMethodDecl()->getName() == "size_bytes" || + // Note here the pointer must be a pointer-to-char type unless there + // is explicit casting. If there is explicit casting, this branch + // is unreachable. Thus, at this branch "size" and "size_bytes" are + // equivalent as the pointer is a char pointer: + MCESize->getMethodDecl()->getName() == "size") + for (StringRef SizedObj : SizedObjs) + if (MCEPtr->getRecordDecl()->isInStdNamespace() && + MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == + SizedObj) + return false; // It is in fact safe + } + return true; // ptr and size are not in safe pattern +} +} // namespace libc_func_matchers } // namespace clang::ast_matchers namespace { @@ -760,6 +1184,10 @@ class SpanTwoParamConstructorGadget : public WarningGadget { .bind(SpanTwoParamConstructorTag)); } + static Matcher matcher(const UnsafeBufferUsageHandler *Handler) { + return stmt(unless(ignoreUnsafeBufferInContainer(Handler)), matcher()); + } + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, bool IsRelatedToDecl, ASTContext &Ctx) const override { @@ -1030,6 +1458,98 @@ class DataInvocationGadget : public WarningGadget { DeclUseList getClaimedVarUseSites() const override { return {}; } }; +class UnsafeLibcFunctionCallGadget : public WarningGadget { + const CallExpr *const Call; + const Expr *UnsafeArg = nullptr; + constexpr static const char *const Tag = "UnsafeLibcFunctionCall"; + // Extra tags for additional information: + constexpr static const char *const UnsafeSprintfTag = + "UnsafeLibcFunctionCall_sprintf"; + constexpr static const char *const UnsafeSizedByTag = + "UnsafeLibcFunctionCall_sized_by"; + constexpr static const char *const UnsafeStringTag = + "UnsafeLibcFunctionCall_string"; + constexpr static const char *const UnsafeVaListTag = + "UnsafeLibcFunctionCall_va_list"; + + enum UnsafeKind { + OTHERS = 0, // no specific information, the callee function is unsafe + SPRINTF = 1, // never call `-sprintf`s, call `-snprintf`s instead. 
+ SIZED_BY = + 2, // the first two arguments of `snprintf` function have + // "__sized_by" relation but they do not conform to safe patterns + STRING = 3, // an argument is a pointer-to-char-as-string but does not + // guarantee null-termination + VA_LIST = 4, // one of the `-printf`s function that take va_list, which is + // considered unsafe as it is not compile-time check + } WarnedFunKind = OTHERS; + +public: + UnsafeLibcFunctionCallGadget(const MatchFinder::MatchResult &Result) + : WarningGadget(Kind::UnsafeLibcFunctionCall), + Call(Result.Nodes.getNodeAs(Tag)) { + if (Result.Nodes.getNodeAs(UnsafeSprintfTag)) + WarnedFunKind = SPRINTF; + else if (auto *E = Result.Nodes.getNodeAs(UnsafeStringTag)) { + WarnedFunKind = STRING; + UnsafeArg = E; + } else if (Result.Nodes.getNodeAs(UnsafeSizedByTag)) { + WarnedFunKind = SIZED_BY; + UnsafeArg = Call->getArg(0); + } else if (Result.Nodes.getNodeAs(UnsafeVaListTag)) + WarnedFunKind = VA_LIST; + } + + static Matcher matcher(const UnsafeBufferUsageHandler *Handler) { + return stmt(unless(ignoreUnsafeLibcCall(Handler)), + anyOf( + callExpr( + callee(functionDecl(anyOf( + // Match a predefined unsafe libc + // function: + functionDecl(libc_func_matchers::isPredefinedUnsafeLibcFunc()), + // Match a call to one of the `v*printf` functions + // taking va-list, which cannot be checked at + // compile-time: + functionDecl(libc_func_matchers::isUnsafeVaListPrintfFunc()) + .bind(UnsafeVaListTag), + // Match a call to a `sprintf` function, which is never + // safe: + functionDecl(libc_func_matchers::isUnsafeSprintfFunc()) + .bind(UnsafeSprintfTag)))), + // (unless the call has a sole string literal argument): + unless( + allOf(hasArgument(0, expr(stringLiteral())), hasNumArgs(1)))), + + // The following two cases require checking against actual + // arguments of the call: + + // Match a call to an `snprintf` function. And first two + // arguments of the call (that describe a buffer) are not in + // safe patterns: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafeSnprintfBuffer()) + .bind(UnsafeSizedByTag), + // Match a call to a `printf` function, which can be safe if + // all arguments are null-terminated: + callExpr(callee(functionDecl(libc_func_matchers::isNormalPrintfFunc())), + libc_func_matchers::hasUnsafePrintfStringArg( + expr().bind(UnsafeStringTag))))); + } + + const Stmt *getBaseStmt() const { return Call; } + + SourceLocation getSourceLoc() const override { return Call->getBeginLoc(); } + + void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, + bool IsRelatedToDecl, + ASTContext &Ctx) const override { + Handler.handleUnsafeLibcCall(Call, WarnedFunKind, Ctx, UnsafeArg); + } + + DeclUseList getClaimedVarUseSites() const override { return {}; } +}; + // Represents expressions of the form `DRE[*]` in the Unspecified Lvalue // Context (see `isInUnspecifiedLvalueContext`). // Note here `[]` is the built-in subscript operator. 
@@ -1452,10 +1972,9 @@ findGadgets(const Decl *D, const UnsafeBufferUsageHandler &Handler, #define WARNING_GADGET(x) \ allOf(x ## Gadget::matcher().bind(#x), \ notInSafeBufferOptOut(&Handler)), -#define WARNING_CONTAINER_GADGET(x) \ - allOf(x ## Gadget::matcher().bind(#x), \ - notInSafeBufferOptOut(&Handler), \ - unless(ignoreUnsafeBufferInContainer(&Handler))), +#define WARNING_OPTIONAL_GADGET(x) \ + allOf(x ## Gadget::matcher(&Handler).bind(#x), \ + notInSafeBufferOptOut(&Handler)), #include "clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def" // Avoid a hanging comma. unless(stmt()) diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index e6ce89dc7ec40..117b2c8bc5793 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2304,6 +2304,20 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { } } + void handleUnsafeLibcCall(const CallExpr *Call, unsigned PrintfInfo, + ASTContext &Ctx, + const Expr *UnsafeArg = nullptr) override { + S.Diag(Call->getBeginLoc(), diag::warn_unsafe_buffer_libc_call) + << Call->getDirectCallee() // We've checked there is a direct callee + << Call->getSourceRange(); + if (PrintfInfo > 0) { + SourceRange R = + UnsafeArg ? UnsafeArg->getSourceRange() : Call->getSourceRange(); + S.Diag(R.getBegin(), diag::note_unsafe_buffer_printf_call) + << PrintfInfo << R; + } + } + void handleUnsafeOperationInContainer(const Stmt *Operation, bool IsRelatedToDecl, ASTContext &Ctx) override { @@ -2382,6 +2396,10 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { return S.Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, Loc); } + bool ignoreUnsafeBufferInLibcCall(const SourceLocation &Loc) const override { + return S.Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, Loc); + } + // Returns the text representation of clang::unsafe_buffer_usage attribute. // `WSSuffix` holds customized "white-space"s, e.g., newline or whilespace // characters. 
@@ -2548,6 +2566,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( !Diags.isIgnored(diag::warn_unsafe_buffer_variable, Node->getBeginLoc()) || !Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, + Node->getBeginLoc()) || + !Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, Node->getBeginLoc())) { clang::checkUnsafeBufferUsage(Node, R, UnsafeBufferUsageShouldEmitSuggestions); @@ -2560,7 +2580,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (!Diags.isIgnored(diag::warn_unsafe_buffer_operation, SourceLocation()) || !Diags.isIgnored(diag::warn_unsafe_buffer_variable, SourceLocation()) || !Diags.isIgnored(diag::warn_unsafe_buffer_usage_in_container, - SourceLocation())) { + SourceLocation()) || + !Diags.isIgnored(diag::warn_unsafe_buffer_libc_call, SourceLocation())) { CallableVisitor(CallAnalyzers).TraverseTranslationUnitDecl(TU); } } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp new file mode 100644 index 0000000000000..2bd12db93fd52 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions-inline-namespace.cpp @@ -0,0 +1,60 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s + +namespace std { + inline namespace __1 { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); + int snprintf( char* buffer, unsigned buf_size, const char* format, ... 
); + } +} + +void f(char * p, char * q, std::span s) { + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::__1::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::__1::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + + /* Test printfs */ + std::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::__1::snprintf(s.data(), 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + std::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + std::__1::snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn +} + +void v(std::string s1) { + std::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn + std::__1::snprintf(s1.data(), s1.size_bytes(), "%s%d", s1.c_str(), 0); // no warn +} + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp new file mode 100644 index 0000000000000..0438f71b1c792 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp @@ -0,0 +1,124 @@ +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \ +// RUN: -verify %s +// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage-in-libc-call \ +// RUN: -verify %s + +typedef struct {} FILE; +void memcpy(); +void __asan_memcpy(); +void strcpy(); +void strcpy_s(); +void wcscpy_s(); +unsigned strlen( const char* str ); +int fprintf( FILE* stream, const char* format, ... ); +int printf( const char* format, ... ); +int sprintf( char* buffer, const char* format, ... ); +int swprintf( char* buffer, const char* format, ... ); +int snprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int snwprintf_s( char* buffer, unsigned buf_size, const char* format, ... ); +int vsnprintf( char* buffer, unsigned buf_size, const char* format, ... ); +int sscanf_s(const char * buffer, const char * format, ...); +int sscanf(const char * buffer, const char * format, ... 
); + +namespace std { + template< class InputIt, class OutputIt > + OutputIt copy( InputIt first, InputIt last, + OutputIt d_first ); + + struct iterator{}; + template + struct span { + T * ptr; + T * data(); + unsigned size_bytes(); + unsigned size(); + iterator begin() const noexcept; + iterator end() const noexcept; + }; + + template + struct basic_string { + T* p; + T *c_str(); + T *data(); + unsigned size_bytes(); + }; + + typedef basic_string string; + typedef basic_string wstring; + + // C function under std: + void memcpy(); + void strcpy(); +} + +void f(char * p, char * q, std::span s, std::span s2) { + memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + std::memcpy(); // expected-warning{{function 'memcpy' is unsafe}} + __builtin_memcpy(p, q, 64); // expected-warning{{function '__builtin_memcpy' is unsafe}} + __builtin___memcpy_chk(p, q, 8, 64); // expected-warning{{function '__builtin___memcpy_chk' is unsafe}} + __asan_memcpy(); // expected-warning{{function '__asan_memcpy' is unsafe}} + strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + std::strcpy(); // expected-warning{{function 'strcpy' is unsafe}} + strcpy_s(); // expected-warning{{function 'strcpy_s' is unsafe}} + wcscpy_s(); // expected-warning{{function 'wcscpy_s' is unsafe}} + + + /* Test printfs */ + fprintf((FILE*)p, "%s%d", p, *p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + printf("%s%d", // expected-warning{{function 'printf' is unsafe}} + p, // expected-note{{string argument is not guaranteed to be null-terminated}} note attached to the unsafe argument + *p); + sprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'sprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + swprintf(q, "%s%d", "hello", *p); // expected-warning{{function 'swprintf' is unsafe}} expected-note{{change to 'snprintf' for explicit bounds checking}} + snprintf(q, 10, "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf(s.data(), s2.size(), "%s%d", "hello", *p); // expected-warning{{function 'snwprintf' is unsafe}} expected-note{{buffer pointer and size may not match}} + snwprintf_s( // expected-warning{{function 'snwprintf_s' is unsafe}} + s.data(), // expected-note{{buffer pointer and size may not match}} // note attached to the buffer + s2.size(), + "%s%d", "hello", *p); + vsnprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // expected-warning{{function 'vsnprintf' is unsafe}} expected-note{{'va_list' is unsafe}} + sscanf(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf' is unsafe}} + sscanf_s(p, "%s%d", "hello", *p); // expected-warning{{function 'sscanf_s' is unsafe}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, p); // expected-warning{{function 'fprintf' is unsafe}} expected-note{{string argument is not guaranteed to be null-terminated}} + fprintf((FILE*)p, "%P%d%p%i hello world %32s", *p, *p, p, *p, "hello"); // no warn + printf("%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", "hello", *p); // no warn + snprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + snwprintf(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + 
snwprintf_s(s.data(), s.size_bytes(), "%s%d", __PRETTY_FUNCTION__, *p); // no warn + strlen("hello");// no warn +} + +void v(std::string s1, int *p) { + snprintf(s1.data(), s1.size_bytes(), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + snprintf(s1.data(), s1.size_bytes(), s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + printf("%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + printf(s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn + fprintf((FILE*)0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", p, s1.c_str()); // no warn + fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn +} + + +void g(char *begin, char *end, char *p, std::span s) { + std::copy(begin, end, p); // no warn + std::copy(s.begin(), s.end(), s.begin()); // no warn +} + +// warning gets turned off +void ff(char * p, char * q, std::span s, std::span s2) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunsafe-buffer-usage-in-libc-call" + memcpy(); + std::memcpy(); + __builtin_memcpy(p, q, 64); + __builtin___memcpy_chk(p, q, 8, 64); + __asan_memcpy(); + strcpy(); + std::strcpy(); + strcpy_s(); + wcscpy_s(); +#pragma clang diagnostic pop +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp index 844311c3a51a5..989931e41c0cc 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-test-unreachable.cpp @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -verify %s -// expected-no-diagnostics - typedef unsigned __darwin_size_t; typedef __darwin_size_t size_t; #define bzero(s, n) __builtin_bzero(s, n) -void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } +void __nosan_bzero(void *dst, size_t sz) { bzero(dst, sz); } // expected-warning{{function '__builtin_bzero' is unsafe}} From a84baef74892dc294eb65bb2a1ea2339be13e8b2 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 5 Sep 2024 20:37:22 -0400 Subject: [PATCH 317/425] [gn build] Port 52dca6ffae08 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 8114c88e665e6..79ba8e3afdf40 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -4,6 +4,7 @@ static_library("Vectorize") { "//llvm/lib/Analysis", "//llvm/lib/IR", "//llvm/lib/Support", + "//llvm/lib/SandboxIR", "//llvm/lib/Transforms/Utils", ] sources = [ @@ -12,6 +13,7 @@ static_library("Vectorize") { "LoopVectorizationLegality.cpp", "LoopVectorize.cpp", "SLPVectorizer.cpp", + "SandboxVectorizer/SandboxVectorizer.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", "VPlanHCFGBuilder.cpp", From 2949720c2e55d2695682d6412d5afe45b167cb1e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 6 Sep 2024 08:59:13 +0800 Subject: [PATCH 318/425] [RISCV] Move vmerge same mask peephole to RISCVVectorPeephole (#106108) We currently fold a vmerge.vvm into its true operand if the true operand is a masked pseudo with the same mask. We can move this over to RISCVVectorPeephole by instead splitting it up into a smaller peephole which converts it to a vmv.v.v first. 
The existing foldVMV_V_V peephole will then take care of folding it if needed. This is very similar to the existing all-ones mask peephole and we could potentially do it inside of it. I opted to put it in a separate peephole to make it easier to reason about, given that the duplication is small, but I could be persuaded either way. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 42 ++--------- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 74 ++++++++++++++++--- .../fixed-vectors-strided-load-store-asm.ll | 22 +++--- .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir | 70 ++++++++++++++++++ 4 files changed, 148 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 4580f3191d138..ff4c0e9bbd50e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3833,15 +3833,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { uint64_t TrueTSFlags = TrueMCID.TSFlags; bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(TrueMCID); - bool IsMasked = false; const RISCV::RISCVMaskedPseudoInfo *Info = RISCV::lookupMaskedIntrinsicByUnmasked(TrueOpc); - if (!Info && HasTiedDest) { - Info = RISCV::getMaskedPseudoInfo(TrueOpc); - IsMasked = true; - } - assert(!(IsMasked && !HasTiedDest) && "Expected tied dest"); - if (!Info) return false; @@ -3853,19 +3846,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { return false; } - // If True is masked then the vmerge must have either the same mask or an all - // 1s mask, since we're going to keep the mask from True. - if (IsMasked) { - // FIXME: Support mask agnostic True instruction which would have an - // undef passthru operand. - SDValue TrueMask = - getMaskSetter(True->getOperand(Info->MaskOpIdx), - True->getOperand(True->getNumOperands() - 1)); - assert(TrueMask); - if (!usesAllOnesMask(Mask, Glue) && getMaskSetter(Mask, Glue) != TrueMask) - return false; - } - // Skip if True has side effect. if (TII->get(TrueOpc).hasUnmodeledSideEffects()) return false; @@ -3930,24 +3910,13 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { (Mask && !usesAllOnesMask(Mask, Glue))) return false; - // If we end up changing the VL or mask of True, then we need to make sure it - // doesn't raise any observable fp exceptions, since changing the active - // elements will affect how fflags is set. - if (TrueVL != VL || !IsMasked) - if (mayRaiseFPException(True.getNode()) && - !True->getFlags().hasNoFPExcept()) - return false; + // Make sure it doesn't raise any observable fp exceptions, since changing the + // active elements will affect how fflags is set. + if (mayRaiseFPException(True.getNode()) && !True->getFlags().hasNoFPExcept()) + return false; SDLoc DL(N); - // From the preconditions we checked above, we know the mask and thus glue - // for the result node will be taken from True. 
- if (IsMasked) { - Mask = True->getOperand(Info->MaskOpIdx); - Glue = True->getOperand(True->getNumOperands() - 1); - assert(Glue.getValueType() == MVT::Glue); - } - unsigned MaskedOpc = Info->MaskedPseudo; #ifndef NDEBUG const MCInstrDesc &MaskedMCID = TII->get(MaskedOpc); @@ -3977,8 +3946,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { Ops.push_back(False); const bool HasRoundingMode = RISCVII::hasRoundModeOp(TrueTSFlags); - const unsigned NormalOpsEnd = TrueVLIndex - IsMasked - HasRoundingMode; - assert(!IsMasked || NormalOpsEnd == Info->MaskOpIdx); + const unsigned NormalOpsEnd = TrueVLIndex - HasRoundingMode; Ops.append(True->op_begin() + HasTiedDest, True->op_begin() + NormalOpsEnd); Ops.push_back(Mask); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index db8e496493c41..48ea2a0187617 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -65,7 +65,8 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool convertToVLMAX(MachineInstr &MI) const; bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) const; - bool convertVMergeToVMv(MachineInstr &MI) const; + bool convertAllOnesVMergeToVMv(MachineInstr &MI) const; + bool convertSameMaskVMergeToVMv(MachineInstr &MI) const; bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); @@ -342,17 +343,13 @@ bool RISCVVectorPeephole::convertToWholeRegister(MachineInstr &MI) const { return true; } -// Transform (VMERGE_VVM_ pt, false, true, allones, vl, sew) to -// (VMV_V_V_ pt, true, vl, sew). It may decrease uses of VMSET. -bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const { +static unsigned getVMV_V_VOpcodeForVMERGE_VVM(const MachineInstr &MI) { #define CASE_VMERGE_TO_VMV(lmul) \ case RISCV::PseudoVMERGE_VVM_##lmul: \ - NewOpc = RISCV::PseudoVMV_V_V_##lmul; \ - break; - unsigned NewOpc; + return RISCV::PseudoVMV_V_V_##lmul; switch (MI.getOpcode()) { default: - return false; + return 0; CASE_VMERGE_TO_VMV(MF8) CASE_VMERGE_TO_VMV(MF4) CASE_VMERGE_TO_VMV(MF2) @@ -361,14 +358,68 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const { CASE_VMERGE_TO_VMV(M4) CASE_VMERGE_TO_VMV(M8) } +} +/// Convert a PseudoVMERGE_VVM with an all ones mask to a PseudoVMV_V_V. +/// +/// %x = PseudoVMERGE_VVM %passthru, %false, %true, %allones, sew, vl +/// -> +/// %x = PseudoVMV_V_V %passthru, %true, vl, sew, tu_mu +bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { + unsigned NewOpc = getVMV_V_VOpcodeForVMERGE_VVM(MI); + if (!NewOpc) + return false; assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0); if (!isAllOnesMask(V0Defs.lookup(&MI))) return false; MI.setDesc(TII->get(NewOpc)); - MI.removeOperand(2); // False operand - MI.removeOperand(3); // Mask operand + MI.removeOperand(2); // False operand + MI.removeOperand(3); // Mask operand + MI.addOperand( + MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); + + // vmv.v.v doesn't have a mask operand, so we may be able to inflate the + // register class for the destination and passthru operands e.g. 
VRNoV0 -> VR + MRI->recomputeRegClass(MI.getOperand(0).getReg()); + if (MI.getOperand(1).getReg() != RISCV::NoRegister) + MRI->recomputeRegClass(MI.getOperand(1).getReg()); + return true; +} + +/// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the +/// same mask, and the masked pseudo's passthru is the same as the false +/// operand, we can convert the PseudoVMERGE_VVM to a PseudoVMV_V_V. +/// +/// %true = PseudoVADD_VV_M1_MASK %false, %x, %y, %mask, vl1, sew, policy +/// %x = PseudoVMERGE_VVM %passthru, %false, %true, %mask, vl2, sew +/// -> +/// %true = PseudoVADD_VV_M1_MASK %false, %x, %y, %mask, vl1, sew, policy +/// %x = PseudoVMV_V_V %passthru, %true, vl2, sew, tu_mu +bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) const { + unsigned NewOpc = getVMV_V_VOpcodeForVMERGE_VVM(MI); + if (!NewOpc) + return false; + MachineInstr *True = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!True || !RISCV::getMaskedPseudoInfo(True->getOpcode()) || + !hasSameEEW(MI, *True)) + return false; + + // True's passthru needs to be equivalent to False + Register TruePassthruReg = True->getOperand(1).getReg(); + Register FalseReg = MI.getOperand(2).getReg(); + if (TruePassthruReg != RISCV::NoRegister && TruePassthruReg != FalseReg) + return false; + + const MachineInstr *TrueV0Def = V0Defs.lookup(True); + const MachineInstr *MIV0Def = V0Defs.lookup(&MI); + assert(TrueV0Def && TrueV0Def->isCopy() && MIV0Def && MIV0Def->isCopy()); + if (TrueV0Def->getOperand(1).getReg() != MIV0Def->getOperand(1).getReg()) + return false; + + MI.setDesc(TII->get(NewOpc)); + MI.removeOperand(2); // False operand + MI.removeOperand(3); // Mask operand MI.addOperand( MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); @@ -623,7 +674,8 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { Changed |= tryToReduceVL(MI); Changed |= convertToUnmasked(MI); Changed |= convertToWholeRegister(MI); - Changed |= convertVMergeToVMv(MI); + Changed |= convertAllOnesVMergeToVMv(MI); + Changed |= convertSameMaskVMergeToVMv(MI); if (foldUndefPassthruVMV_V_V(MI)) { Changed |= true; continue; // MI is erased diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index e57b6a22dd6ea..569ada7949b1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -62,12 +62,11 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB1_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vse8.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a1), a4, v0.t +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: addi a1, a1, 160 ; CHECK-NEXT: bne a0, a2, .LBB1_1 @@ -344,12 +343,11 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB7_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vmv1r.v v10, v8 -; 
CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vlse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse8.v v8, (a0), a4, v0.t ; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: addi a0, a0, 160 ; CHECK-NEXT: bne a1, a2, .LBB7_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index 19a918148e6eb..875d4229bbc6e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -68,3 +68,73 @@ body: | $v0 = COPY %mask %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, %avl, 5 ... +--- +name: same_mask +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vr = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ +... +--- +# Shouldn't be converted because false operands are different +name: same_mask_different_false +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask_different_false + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ +... 
+--- +# Shouldn't be converted because EEWs are different +name: same_mask_different_eew +body: | + bb.0: + liveins: $v8, $v9, $v0 + ; CHECK-LABEL: name: same_mask_different_eew + ; CHECK: liveins: $v8, $v9, $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 + ; CHECK-NEXT: %mask:vr = COPY $v0 + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %pt:vrnov0 = COPY $v8 + %false:vrnov0 = COPY $v9 + %mask:vr = COPY $v0 + $v0 = COPY %mask + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ + $v0 = COPY %mask + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ From 11084c5c49f8bb7825f81adc5b7140b3506fe253 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 5 Sep 2024 18:00:46 -0700 Subject: [PATCH 319/425] [lldb] Convert DebuggerThread.cpp to new Status API (NFC) --- lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp index 9a5fcd88e1282..d62eb26ca1a29 100644 --- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp @@ -75,7 +75,7 @@ Status DebuggerThread::DebugAttach(lldb::pid_t pid, return DebuggerThreadAttachRoutine(pid, attach_info); }); if (!secondary_thread) { - result = Status(secondary_thread.takeError()); + result = Status::FromError(secondary_thread.takeError()); LLDB_LOG(log, "couldn't attach to process '{0}'. {1}", pid, result); } From 16df489fdae23e77eb5180e4d4dc99b07421bf77 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 5 Sep 2024 18:35:55 -0700 Subject: [PATCH 320/425] [TableGen] Add const variants of accessors for backend (#106658) Split RecordKeeper `getAllDerivedDefinitions` family of functions into two variants: (a) non-const ones that return vectors of `Record *` and (b) const ones, that return vector/ArrayRef of `const Record *`. This will help gradual migration of TableGen backends to use `const RecordKeeper` and by implication change code to work with const pointers and better const correctness. Existing backends are not yet compatible with the const family of functions, so change them to use a non-constant `RecordKeeper` reference, till they are migrated. 
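As a rough usage sketch (illustrative only; the `EmitFoo`/`EmitBar` backends and
the "Foo"/"Bar" record classes are made up), a migrated backend takes a
`const RecordKeeper &` and receives the const results, while an unmigrated one
keeps the non-const reference:

  // Migrated backend: const overload returns ArrayRef<const Record *>.
  static void EmitFoo(const RecordKeeper &Records, raw_ostream &OS) {
    for (const Record *Def : Records.getAllDerivedDefinitions("Foo"))
      OS << Def->getName() << "\n";
  }

  // Unmigrated backend: non-const overload returns const std::vector<Record *> &.
  static void EmitBar(RecordKeeper &Records, raw_ostream &OS) {
    for (Record *Def : Records.getAllDerivedDefinitions("Bar"))
      OS << Def->getName() << "\n";
  }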
--- clang/utils/TableGen/ClangAttrEmitter.cpp | 4 +- clang/utils/TableGen/ClangSyntaxEmitter.cpp | 2 +- llvm/include/llvm/TableGen/DirectiveEmitter.h | 5 +- llvm/include/llvm/TableGen/Record.h | 34 ++++++++-- llvm/lib/TableGen/Record.cpp | 65 +++++++++++++++---- .../TableGen/Basic/CodeGenIntrinsics.cpp | 2 +- .../TableGen/Common/SubtargetFeatureInfo.cpp | 2 +- .../TableGen/Common/SubtargetFeatureInfo.h | 2 +- llvm/utils/TableGen/ExegesisEmitter.cpp | 2 +- llvm/utils/TableGen/GlobalISelEmitter.cpp | 2 +- llvm/utils/TableGen/SubtargetEmitter.cpp | 2 +- llvm/utils/TableGen/TableGen.cpp | 2 +- mlir/include/mlir/TableGen/GenInfo.h | 6 +- mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 28 ++++---- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- mlir/tools/mlir-tblgen/OpDocGen.cpp | 20 +++--- mlir/tools/mlir-tblgen/OpInterfacesGen.cpp | 15 ++--- mlir/tools/mlir-tblgen/RewriterGen.cpp | 8 +-- .../tools/tblgen-to-irdl/OpDefinitionsGen.cpp | 7 +- 19 files changed, 134 insertions(+), 76 deletions(-) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index adbe6af62d5cb..d24215d10f17c 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -189,7 +189,7 @@ static StringRef NormalizeGNUAttrSpelling(StringRef AttrSpelling) { typedef std::vector> ParsedAttrMap; -static ParsedAttrMap getParsedAttrList(const RecordKeeper &Records, +static ParsedAttrMap getParsedAttrList(RecordKeeper &Records, ParsedAttrMap *Dupes = nullptr, bool SemaOnly = true) { std::vector Attrs = Records.getAllDerivedDefinitions("Attr"); @@ -4344,7 +4344,7 @@ static void GenerateAppertainsTo(const Record &Attr, raw_ostream &OS) { // written into OS and the checks for merging declaration attributes are // written into MergeOS. static void GenerateMutualExclusionsChecks(const Record &Attr, - const RecordKeeper &Records, + RecordKeeper &Records, raw_ostream &OS, raw_ostream &MergeDeclOS, raw_ostream &MergeStmtOS) { diff --git a/clang/utils/TableGen/ClangSyntaxEmitter.cpp b/clang/utils/TableGen/ClangSyntaxEmitter.cpp index 9720d58731843..2a69e4c353b6b 100644 --- a/clang/utils/TableGen/ClangSyntaxEmitter.cpp +++ b/clang/utils/TableGen/ClangSyntaxEmitter.cpp @@ -41,7 +41,7 @@ using llvm::formatv; // stable and useful way, where abstract Node subclasses correspond to ranges. class Hierarchy { public: - Hierarchy(const llvm::RecordKeeper &Records) { + Hierarchy(llvm::RecordKeeper &Records) { for (llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) add(T); for (llvm::Record *Derived : Records.getAllDerivedDefinitions("NodeType")) diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index 1121459be6ce7..ca21c8fc10145 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -15,8 +15,7 @@ namespace llvm { // DirectiveBase.td and provides helper methods for accessing it. 
class DirectiveLanguage { public: - explicit DirectiveLanguage(const llvm::RecordKeeper &Records) - : Records(Records) { + explicit DirectiveLanguage(llvm::RecordKeeper &Records) : Records(Records) { const auto &DirectiveLanguages = getDirectiveLanguages(); Def = DirectiveLanguages[0]; } @@ -71,7 +70,7 @@ class DirectiveLanguage { private: const llvm::Record *Def; - const llvm::RecordKeeper &Records; + llvm::RecordKeeper &Records; std::vector getDirectiveLanguages() const { return Records.getAllDerivedDefinitions("DirectiveLanguage"); diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index ff596df94e4f5..5d36fcf57e23e 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -2057,19 +2057,28 @@ class RecordKeeper { //===--------------------------------------------------------------------===// // High-level helper methods, useful for tablegen backends. + // Non-const methods return std::vector by value or reference. + // Const methods return std::vector by value or + // ArrayRef. + /// Get all the concrete records that inherit from the one specified /// class. The class must be defined. - std::vector getAllDerivedDefinitions(StringRef ClassName) const; + ArrayRef getAllDerivedDefinitions(StringRef ClassName) const; + const std::vector &getAllDerivedDefinitions(StringRef ClassName); /// Get all the concrete records that inherit from all the specified /// classes. The classes must be defined. - std::vector getAllDerivedDefinitions( - ArrayRef ClassNames) const; + std::vector + getAllDerivedDefinitions(ArrayRef ClassNames) const; + std::vector + getAllDerivedDefinitions(ArrayRef ClassNames); /// Get all the concrete records that inherit from specified class, if the /// class is defined. Returns an empty vector if the class is not defined. - std::vector + ArrayRef getAllDerivedDefinitionsIfDefined(StringRef ClassName) const; + const std::vector & + getAllDerivedDefinitionsIfDefined(StringRef ClassName); void dump() const; @@ -2081,9 +2090,24 @@ class RecordKeeper { RecordKeeper &operator=(RecordKeeper &&) = delete; RecordKeeper &operator=(const RecordKeeper &) = delete; + // Helper template functions for backend accessors. + template + const VecTy & + getAllDerivedDefinitionsImpl(StringRef ClassName, + std::map &Cache) const; + + template + VecTy getAllDerivedDefinitionsImpl(ArrayRef ClassNames) const; + + template + const VecTy &getAllDerivedDefinitionsIfDefinedImpl( + StringRef ClassName, std::map &Cache) const; + std::string InputFilename; RecordMap Classes, Defs; - mutable StringMap> ClassRecordsMap; + mutable std::map> + ClassRecordsMapConst; + mutable std::map> ClassRecordsMap; GlobalMap ExtraGlobals; // These members are for the phase timing feature. We need a timer group, diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index cead8f865a607..17afa2f7eb1b9 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -3248,25 +3248,28 @@ void RecordKeeper::stopBackendTimer() { } } -std::vector -RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const { +template +const VecTy &RecordKeeper::getAllDerivedDefinitionsImpl( + StringRef ClassName, std::map &Cache) const { // We cache the record vectors for single classes. Many backends request // the same vectors multiple times. 
- auto Pair = ClassRecordsMap.try_emplace(ClassName); + auto Pair = Cache.try_emplace(ClassName.str()); if (Pair.second) - Pair.first->second = getAllDerivedDefinitions(ArrayRef(ClassName)); + Pair.first->second = + getAllDerivedDefinitionsImpl(ArrayRef(ClassName)); return Pair.first->second; } -std::vector RecordKeeper::getAllDerivedDefinitions( +template +VecTy RecordKeeper::getAllDerivedDefinitionsImpl( ArrayRef ClassNames) const { - SmallVector ClassRecs; - std::vector Defs; + SmallVector ClassRecs; + VecTy Defs; assert(ClassNames.size() > 0 && "At least one class must be passed."); for (const auto &ClassName : ClassNames) { - Record *Class = getClass(ClassName); + const Record *Class = getClass(ClassName); if (!Class) PrintFatalError("The class '" + ClassName + "' is not defined\n"); ClassRecs.push_back(Class); @@ -3274,20 +3277,54 @@ std::vector RecordKeeper::getAllDerivedDefinitions( for (const auto &OneDef : getDefs()) { if (all_of(ClassRecs, [&OneDef](const Record *Class) { - return OneDef.second->isSubClassOf(Class); - })) + return OneDef.second->isSubClassOf(Class); + })) Defs.push_back(OneDef.second.get()); } - llvm::sort(Defs, LessRecord()); - return Defs; } +template +const VecTy &RecordKeeper::getAllDerivedDefinitionsIfDefinedImpl( + StringRef ClassName, std::map &Cache) const { + return getClass(ClassName) + ? getAllDerivedDefinitionsImpl(ClassName, Cache) + : Cache[""]; +} + +ArrayRef +RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) const { + return getAllDerivedDefinitionsImpl>( + ClassName, ClassRecordsMapConst); +} + +const std::vector & +RecordKeeper::getAllDerivedDefinitions(StringRef ClassName) { + return getAllDerivedDefinitionsImpl>(ClassName, + ClassRecordsMap); +} + +std::vector +RecordKeeper::getAllDerivedDefinitions(ArrayRef ClassNames) const { + return getAllDerivedDefinitionsImpl>(ClassNames); +} + std::vector +RecordKeeper::getAllDerivedDefinitions(ArrayRef ClassNames) { + return getAllDerivedDefinitionsImpl>(ClassNames); +} + +ArrayRef RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) const { - return getClass(ClassName) ? 
getAllDerivedDefinitions(ClassName) - : std::vector(); + return getAllDerivedDefinitionsIfDefinedImpl>( + ClassName, ClassRecordsMapConst); +} + +const std::vector & +RecordKeeper::getAllDerivedDefinitionsIfDefined(StringRef ClassName) { + return getAllDerivedDefinitionsIfDefinedImpl>( + ClassName, ClassRecordsMap); } void RecordKeeper::dumpAllocationStats(raw_ostream &OS) const { diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 23c64912c780f..05104e938b848 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -43,7 +43,7 @@ CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { CodeGenIntrinsicContext Ctx(RC); - std::vector Defs = RC.getAllDerivedDefinitions("Intrinsic"); + ArrayRef Defs = RC.getAllDerivedDefinitions("Intrinsic"); Intrinsics.reserve(Defs.size()); for (const Record *Def : Defs) diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp index 4f57234d6fe27..a4d6d8d21b356 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp @@ -21,7 +21,7 @@ LLVM_DUMP_METHOD void SubtargetFeatureInfo::dump() const { #endif std::vector> -SubtargetFeatureInfo::getAll(const RecordKeeper &Records) { +SubtargetFeatureInfo::getAll(RecordKeeper &Records) { std::vector> SubtargetFeatures; std::vector AllPredicates = Records.getAllDerivedDefinitions("Predicate"); diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h index 2635e4b733e1a..fee2c0263c496 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h @@ -49,7 +49,7 @@ struct SubtargetFeatureInfo { void dump() const; static std::vector> - getAll(const RecordKeeper &Records); + getAll(RecordKeeper &Records); /// Emit the subtarget feature flag definitions. 
/// diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp index d48c7f3a480f2..0de7cb4233748 100644 --- a/llvm/utils/TableGen/ExegesisEmitter.cpp +++ b/llvm/utils/TableGen/ExegesisEmitter.cpp @@ -59,7 +59,7 @@ class ExegesisEmitter { }; static std::map -collectPfmCounters(const RecordKeeper &Records) { +collectPfmCounters(RecordKeeper &Records) { std::map PfmCounterNameTable; const auto AddPfmCounterName = [&PfmCounterNameTable]( const Record *PfmCounterDef) { diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index a491a049e7c81..2606768c0c582 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -335,7 +335,7 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { private: std::string ClassName; - const RecordKeeper &RK; + RecordKeeper &RK; const CodeGenDAGPatterns CGP; const CodeGenTarget &Target; CodeGenRegBank &CGRegs; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 66ca38ee5ae2f..7ae61cb7c446b 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1545,7 +1545,7 @@ void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) { EmitProcessorModels(OS); } -static void emitPredicateProlog(const RecordKeeper &Records, raw_ostream &OS) { +static void emitPredicateProlog(RecordKeeper &Records, raw_ostream &OS) { std::string Buffer; raw_string_ostream Stream(Buffer); diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 7ee6fa5c83211..882410bac081b 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -52,7 +52,7 @@ static void PrintEnums(RecordKeeper &Records, raw_ostream &OS) { static void PrintSets(const RecordKeeper &Records, raw_ostream &OS) { SetTheory Sets; Sets.addFieldExpander("Set", "Elements"); - for (Record *Rec : Records.getAllDerivedDefinitions("Set")) { + for (const Record *Rec : Records.getAllDerivedDefinitions("Set")) { OS << Rec->getName() << " = ["; const std::vector *Elts = Sets.expand(Rec); assert(Elts && "Couldn't expand Set instance"); diff --git a/mlir/include/mlir/TableGen/GenInfo.h b/mlir/include/mlir/TableGen/GenInfo.h index ef2e12f07df16..d59d64223827b 100644 --- a/mlir/include/mlir/TableGen/GenInfo.h +++ b/mlir/include/mlir/TableGen/GenInfo.h @@ -21,8 +21,8 @@ class RecordKeeper; namespace mlir { /// Generator function to invoke. -using GenFunction = std::function; +using GenFunction = + std::function; /// Structure to group information about a generator (argument to invoke via /// mlir-tblgen, description, and generator function). @@ -34,7 +34,7 @@ class GenInfo { : arg(arg), description(description), generator(std::move(generator)) {} /// Invokes the generator and returns whether the generator failed. 
- bool invoke(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) const { + bool invoke(llvm::RecordKeeper &recordKeeper, raw_ostream &os) const { assert(generator && "Cannot call generator with null generator"); return generator(recordKeeper, os); } diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index eccd8029d950f..feca04bff643d 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -690,10 +690,10 @@ class DefGenerator { bool emitDefs(StringRef selectedDialect); protected: - DefGenerator(std::vector &&defs, raw_ostream &os, + DefGenerator(const std::vector &defs, raw_ostream &os, StringRef defType, StringRef valueType, bool isAttrGenerator) - : defRecords(std::move(defs)), os(os), defType(defType), - valueType(valueType), isAttrGenerator(isAttrGenerator) { + : defRecords(defs), os(os), defType(defType), valueType(valueType), + isAttrGenerator(isAttrGenerator) { // Sort by occurrence in file. llvm::sort(defRecords, [](llvm::Record *lhs, llvm::Record *rhs) { return lhs->getID() < rhs->getID(); @@ -721,13 +721,13 @@ class DefGenerator { /// A specialized generator for AttrDefs. struct AttrDefGenerator : public DefGenerator { - AttrDefGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + AttrDefGenerator(llvm::RecordKeeper &records, raw_ostream &os) : DefGenerator(records.getAllDerivedDefinitionsIfDefined("AttrDef"), os, "Attr", "Attribute", /*isAttrGenerator=*/true) {} }; /// A specialized generator for TypeDefs. struct TypeDefGenerator : public DefGenerator { - TypeDefGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + TypeDefGenerator(llvm::RecordKeeper &records, raw_ostream &os) : DefGenerator(records.getAllDerivedDefinitionsIfDefined("TypeDef"), os, "Type", "Type", /*isAttrGenerator=*/false) {} }; @@ -1029,7 +1029,7 @@ bool DefGenerator::emitDefs(StringRef selectedDialect) { /// Find all type constraints for which a C++ function should be generated. 
static std::vector -getAllTypeConstraints(const llvm::RecordKeeper &records) { +getAllTypeConstraints(llvm::RecordKeeper &records) { std::vector result; for (llvm::Record *def : records.getAllDerivedDefinitionsIfDefined("TypeConstraint")) { @@ -1046,7 +1046,7 @@ getAllTypeConstraints(const llvm::RecordKeeper &records) { return result; } -static void emitTypeConstraintDecls(const llvm::RecordKeeper &records, +static void emitTypeConstraintDecls(llvm::RecordKeeper &records, raw_ostream &os) { static const char *const typeConstraintDecl = R"( bool {0}(::mlir::Type type); @@ -1056,7 +1056,7 @@ bool {0}(::mlir::Type type); os << strfmt(typeConstraintDecl, *constr.getCppFunctionName()); } -static void emitTypeConstraintDefs(const llvm::RecordKeeper &records, +static void emitTypeConstraintDefs(llvm::RecordKeeper &records, raw_ostream &os) { static const char *const typeConstraintDef = R"( bool {0}(::mlir::Type type) { @@ -1087,13 +1087,13 @@ static llvm::cl::opt static mlir::GenRegistration genAttrDefs("gen-attrdef-defs", "Generate AttrDef definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { AttrDefGenerator generator(records, os); return generator.emitDefs(attrDialect); }); static mlir::GenRegistration genAttrDecls("gen-attrdef-decls", "Generate AttrDef declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { AttrDefGenerator generator(records, os); return generator.emitDecls(attrDialect); }); @@ -1109,13 +1109,13 @@ static llvm::cl::opt static mlir::GenRegistration genTypeDefs("gen-typedef-defs", "Generate TypeDef definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { TypeDefGenerator generator(records, os); return generator.emitDefs(typeDialect); }); static mlir::GenRegistration genTypeDecls("gen-typedef-decls", "Generate TypeDef declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { TypeDefGenerator generator(records, os); return generator.emitDecls(typeDialect); }); @@ -1123,14 +1123,14 @@ static mlir::GenRegistration static mlir::GenRegistration genTypeConstrDefs("gen-type-constraint-defs", "Generate type constraint definitions", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { emitTypeConstraintDefs(records, os); return false; }); static mlir::GenRegistration genTypeConstrDecls("gen-type-constraint-decls", "Generate type constraint declarations", - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { emitTypeConstraintDecls(records, os); return false; }); diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index ffa2e17cc8f91..b7f6ca975a9a3 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -149,7 +149,7 @@ static void verifyClause(Record *op, Record *clause) { /// Verify that all properties of `OpenMP_Clause`s of records deriving from /// `OpenMP_Op`s have been inherited by the latter. 
-static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { +static bool verifyDecls(RecordKeeper &recordKeeper, raw_ostream &) { for (Record *op : recordKeeper.getAllDerivedDefinitions("OpenMP_Op")) { for (Record *clause : op->getValueAsListOfDefs("clauseList")) verifyClause(op, clause); diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 71df80cd110f1..066e5b24f5a3c 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -282,7 +282,7 @@ static void emitSourceLink(StringRef inputFilename, raw_ostream &os) { << inputFromMlirInclude << ")\n\n"; } -static void emitOpDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitOpDoc(RecordKeeper &recordKeeper, raw_ostream &os) { auto opDefs = getRequestedOpDefinitions(recordKeeper); os << "\n"; @@ -371,8 +371,8 @@ static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { os << "\n"; } -static void emitAttrOrTypeDefDoc(const RecordKeeper &recordKeeper, - raw_ostream &os, StringRef recordTypeName) { +static void emitAttrOrTypeDefDoc(RecordKeeper &recordKeeper, raw_ostream &os, + StringRef recordTypeName) { std::vector defs = recordKeeper.getAllDerivedDefinitions(recordTypeName); @@ -405,7 +405,7 @@ static void emitEnumDoc(const EnumAttr &def, raw_ostream &os) { os << "\n"; } -static void emitEnumDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitEnumDoc(RecordKeeper &recordKeeper, raw_ostream &os) { std::vector defs = recordKeeper.getAllDerivedDefinitions("EnumAttr"); @@ -518,7 +518,7 @@ static void emitDialectDoc(const Dialect &dialect, StringRef inputFilename, os); } -static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { +static bool emitDialectDoc(RecordKeeper &recordKeeper, raw_ostream &os) { std::vector dialectDefs = recordKeeper.getAllDerivedDefinitionsIfDefined("Dialect"); SmallVector dialects(dialectDefs.begin(), dialectDefs.end()); @@ -617,34 +617,34 @@ static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { static mlir::GenRegistration genAttrRegister("gen-attrdef-doc", "Generate dialect attribute documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitAttrOrTypeDefDoc(records, os, "AttrDef"); return false; }); static mlir::GenRegistration genOpRegister("gen-op-doc", "Generate dialect documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitOpDoc(records, os); return false; }); static mlir::GenRegistration genTypeRegister("gen-typedef-doc", "Generate dialect type documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitAttrOrTypeDefDoc(records, os, "TypeDef"); return false; }); static mlir::GenRegistration genEnumRegister("gen-enum-doc", "Generate dialect enum documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitEnumDoc(records, os); return false; }); static mlir::GenRegistration genRegister("gen-dialect-doc", "Generate dialect documentation", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { return emitDialectDoc(records, os); }); diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp index 4b06b92fbc8a8..00f21a1cefbdd 100644 --- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp +++ 
b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp @@ -62,8 +62,7 @@ static void emitMethodNameAndArgs(const InterfaceMethod &method, /// Get an array of all OpInterface definitions but exclude those subclassing /// "DeclareOpInterfaceMethods". static std::vector -getAllInterfaceDefinitions(const llvm::RecordKeeper &recordKeeper, - StringRef name) { +getAllInterfaceDefinitions(llvm::RecordKeeper &recordKeeper, StringRef name) { std::vector defs = recordKeeper.getAllDerivedDefinitions((name + "Interface").str()); @@ -118,7 +117,7 @@ class InterfaceGenerator { /// A specialized generator for attribute interfaces. struct AttrInterfaceGenerator : public InterfaceGenerator { - AttrInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + AttrInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Attr"), os) { valueType = "::mlir::Attribute"; interfaceBaseType = "AttributeInterface"; @@ -133,7 +132,7 @@ struct AttrInterfaceGenerator : public InterfaceGenerator { }; /// A specialized generator for operation interfaces. struct OpInterfaceGenerator : public InterfaceGenerator { - OpInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + OpInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Op"), os) { valueType = "::mlir::Operation *"; interfaceBaseType = "OpInterface"; @@ -149,7 +148,7 @@ struct OpInterfaceGenerator : public InterfaceGenerator { }; /// A specialized generator for type interfaces. struct TypeInterfaceGenerator : public InterfaceGenerator { - TypeInterfaceGenerator(const llvm::RecordKeeper &records, raw_ostream &os) + TypeInterfaceGenerator(llvm::RecordKeeper &records, raw_ostream &os) : InterfaceGenerator(getAllInterfaceDefinitions(records, "Type"), os) { valueType = "::mlir::Type"; interfaceBaseType = "TypeInterface"; @@ -684,15 +683,15 @@ struct InterfaceGenRegistration { genDefDesc(("Generate " + genDesc + " interface definitions").str()), genDocDesc(("Generate " + genDesc + " interface documentation").str()), genDecls(genDeclArg, genDeclDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDecls(); }), genDefs(genDefArg, genDefDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDefs(); }), genDocs(genDocArg, genDocDesc, - [](const llvm::RecordKeeper &records, raw_ostream &os) { + [](llvm::RecordKeeper &records, raw_ostream &os) { return GeneratorT(records, os).emitInterfaceDocs(); }) {} diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index 2c79ba2cd6353..401f02246ed23 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -268,7 +268,7 @@ class PatternEmitter { // inlining them. 
class StaticMatcherHelper { public: - StaticMatcherHelper(raw_ostream &os, const RecordKeeper &recordKeeper, + StaticMatcherHelper(raw_ostream &os, RecordKeeper &recordKeeper, RecordOperatorMap &mapper); // Determine if we should inline the match logic or delegate to a static @@ -1886,7 +1886,7 @@ void PatternEmitter::createAggregateLocalVarsForOpArgs( } StaticMatcherHelper::StaticMatcherHelper(raw_ostream &os, - const RecordKeeper &recordKeeper, + RecordKeeper &recordKeeper, RecordOperatorMap &mapper) : opMap(mapper), staticVerifierEmitter(os, recordKeeper) {} @@ -1951,7 +1951,7 @@ StringRef StaticMatcherHelper::getVerifierName(DagLeaf leaf) { return staticVerifierEmitter.getTypeConstraintFn(leaf.getAsConstraint()); } -static void emitRewriters(const RecordKeeper &recordKeeper, raw_ostream &os) { +static void emitRewriters(RecordKeeper &recordKeeper, raw_ostream &os) { emitSourceFileHeader("Rewriters", os, recordKeeper); const auto &patterns = recordKeeper.getAllDerivedDefinitions("Pattern"); @@ -2001,7 +2001,7 @@ static void emitRewriters(const RecordKeeper &recordKeeper, raw_ostream &os) { static mlir::GenRegistration genRewriters("gen-rewriters", "Generate pattern rewriters", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { emitRewriters(records, os); return false; }); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index a55f3539f31db..0957a5d55db95 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -146,14 +146,13 @@ static irdl::DialectOp createIRDLDialect(OpBuilder &builder) { } static std::vector -getOpDefinitions(const RecordKeeper &recordKeeper) { +getOpDefinitions(RecordKeeper &recordKeeper) { if (!recordKeeper.getClass("Op")) return {}; return recordKeeper.getAllDerivedDefinitions("Op"); } -static bool emitDialectIRDLDefs(const RecordKeeper &recordKeeper, - raw_ostream &os) { +static bool emitDialectIRDLDefs(RecordKeeper &recordKeeper, raw_ostream &os) { // Initialize. MLIRContext ctx; ctx.getOrLoadDialect(); @@ -185,6 +184,6 @@ static bool emitDialectIRDLDefs(const RecordKeeper &recordKeeper, static mlir::GenRegistration genOpDefs("gen-dialect-irdl-defs", "Generate IRDL dialect definitions", - [](const RecordKeeper &records, raw_ostream &os) { + [](RecordKeeper &records, raw_ostream &os) { return emitDialectIRDLDefs(records, os); }); From 616a8ce6203d8c7569266bfaf163e74df1f440ad Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 5 Sep 2024 18:45:21 -0700 Subject: [PATCH 321/425] [clang-format] Correctly annotate braces in macro definition (#107352) Also add a test case for #107096. Fixes #106418. 
--- clang/lib/Format/UnwrappedLineParser.cpp | 3 ++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 20 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 246b29d308bfa..1727ed93822b1 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -570,7 +570,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
              NextTok->isOneOf(Keywords.kw_of, Keywords.kw_in, Keywords.kw_as));
         ProbablyBracedList =
-            ProbablyBracedList || (IsCpp && NextTok->is(tok::l_paren));
+            ProbablyBracedList || (IsCpp && (PrevTok->Tok.isLiteral() ||
+                                             NextTok->is(tok::l_paren)));
         // If there is a comma, semicolon or right paren after the closing
         // brace, we assume this is a braced initializer list.
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index c0436d8a2e180..1bb796fd6f5ee 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3278,6 +3278,26 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   EXPECT_BRACE_KIND(Tokens[10], BK_Block);
   EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace);
   EXPECT_BRACE_KIND(Tokens[11], BK_Block);
+
+  Tokens = annotate("#define MACRO \\\n"
+                    "  struct hash { \\\n"
+                    "    void f() { return; } \\\n"
+                    "  };");
+  ASSERT_EQ(Tokens.size(), 20u) << Tokens;
+  EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace);
+  EXPECT_BRACE_KIND(Tokens[8], BK_Block);
+  EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_FunctionDeclarationLParen);
+  EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace);
+  EXPECT_BRACE_KIND(Tokens[13], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[16], BK_Block);
+  EXPECT_TOKEN(Tokens[17], tok::r_brace, TT_StructRBrace);
+  EXPECT_BRACE_KIND(Tokens[17], BK_Block);
+
+  Tokens = annotate("#define MEMBER(NAME) NAME{\"\"}");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit);
+  EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit);
 }

 TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) {

From c02fd17c1e20615c9e6174a3f8ad4ef0ec5ebbec Mon Sep 17 00:00:00 2001
From: donald chen
Date: Fri, 6 Sep 2024 10:02:05 +0800
Subject: [PATCH 322/425] [mlir] [scf] fix crash when conversion from scf to control flow (#107221)

This patch fixes a crash when scf.parallel's region doesn't terminate with a
reduce op. This can happen in dialect conversion.
---
 mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
index 2372ab5b82a77..45f3bcfa393be 100644
--- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
+++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
@@ -482,7 +482,10 @@ LogicalResult ParallelLowering::matchAndRewrite(ParallelOp parallelOp,
                                                PatternRewriter &rewriter) const {
   Location loc = parallelOp.getLoc();
-  auto reductionOp = cast(parallelOp.getBody()->getTerminator());
+  auto reductionOp = dyn_cast(parallelOp.getBody()->getTerminator());
+  if (!reductionOp) {
+    return failure();
+  }
   // For a parallel loop, we essentially need to create an n-dimensional loop
We do this by translating to scf.for ops and have those lowered in From 33ceb2dd7596a05277fd246865862df6b03cf976 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 19:04:30 -0700 Subject: [PATCH 323/425] [clang-tidy] Avoid repeated hash lookups (NFC) (#107490) --- .../cppcoreguidelines/PreferMemberInitializerCheck.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 5f046c502eb38..e516b71088425 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -67,9 +67,7 @@ static bool canAdvanceAssignment(AssignedLevel Level) { static void updateAssignmentLevel( const FieldDecl *Field, const Expr *Init, const CXXConstructorDecl *Ctor, llvm::DenseMap &AssignedFields) { - auto It = AssignedFields.find(Field); - if (It == AssignedFields.end()) - It = AssignedFields.insert({Field, AssignedLevel::None}).first; + auto It = AssignedFields.try_emplace(Field, AssignedLevel::None).first; if (!canAdvanceAssignment(It->second)) // fast path for already decided field. From 144314eaa5ca7f44817cf0ac162dbd17a5d88391 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 5 Sep 2024 19:04:56 -0700 Subject: [PATCH 324/425] [SLPVectorizer] Avoid repeated hash lookups (NFC) (#107491) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a77d236413a96..c87d1055c8bc5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5854,13 +5854,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Build a map between user nodes and their operands order to speedup // search. The graph currently does not provide this dependency directly. - for (EdgeInfo &EI : TE->UserTreeIndices) { - TreeEntry *UserTE = EI.UserTE; - auto It = Users.find(UserTE); - if (It == Users.end()) - It = Users.insert({UserTE, {}}).first; - It->second.emplace_back(EI.EdgeIdx, TE); - } + for (EdgeInfo &EI : TE->UserTreeIndices) + Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE); } // Erase filtered entries. for (TreeEntry *TE : Filtered) From 5acd9d11373ca67f0d4baf17a78ebb56193a7df0 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 5 Sep 2024 19:13:08 -0700 Subject: [PATCH 325/425] [clang][scan] Report module dependencies in topological order (#107474) --- .../Tooling/DependencyScanning/ModuleDepCollector.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 370d834846859..c775adc0ddd73 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -569,12 +569,11 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { return {}; // If this module has been handled already, just return its ID. 
-  auto ModI = MDC.ModularDeps.insert({M, nullptr});
-  if (!ModI.second)
-    return ModI.first->second->ID;
+  if (auto ModI = MDC.ModularDeps.find(M); ModI != MDC.ModularDeps.end())
+    return ModI->second->ID;
-  ModI.first->second = std::make_unique();
-  ModuleDeps &MD = *ModI.first->second;
+  auto OwnedMD = std::make_unique();
+  ModuleDeps &MD = *OwnedMD;
   MD.ID.ModuleName = M->getFullModuleName();
   MD.IsSystem = M->IsSystem;
@@ -650,6 +649,8 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) {
   MD.BuildInfo = std::move(CI);
+  MDC.ModularDeps.insert({M, std::move(OwnedMD)});
+
   return MD.ID;
 }

From ede40da1f8c1e91601b985cd32ad785aa8806880 Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Fri, 6 Sep 2024 10:45:59 +0800
Subject: [PATCH 326/425] [mlir][tensor] Add check for indices of `tensor.gather` (#106894)

This patch adds a check for the indices of `tensor.gather` and `tensor.scatter`:
the length of gather_dims/scatter_dims should match the size of the last
dimension of the indices. Fixes #94901.
---
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 12 +++-
 mlir/test/Dialect/Tensor/invalid.mlir | 80 ++++++++++++++++++------
 2 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 996de530c255d..5fbb3d55f8faa 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1288,7 +1288,8 @@ RankedTensorType GatherOp::inferResultType(RankedTensorType sourceType,
 }
 static LogicalResult
-verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
+verifyGatherOrScatterDims(Operation *op, ArrayRef dims,
+                          ArrayRef indices, int64_t rank,
                           StringRef gatherOrScatter, StringRef sourceOrDest) {
   if (dims.empty())
     return op->emitOpError(gatherOrScatter) << "_dims must be non-empty";
@@ -1297,6 +1298,9 @@ verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
   if (numGatherDims > rank)
     return op->emitOpError(gatherOrScatter)
            << "_dims overflow " << sourceOrDest << " rank";
+  if (indices.empty() || indices.back() != numGatherDims)
+    return op->emitOpError(gatherOrScatter)
+           << "_dims length must match the size of last dimension of indices";
   for (int64_t val : dims) {
     if (val < 0)
       return op->emitOpError(gatherOrScatter)
@@ -1316,7 +1320,8 @@ verifyGatherOrScatterDims(Operation *op, ArrayRef dims, int64_t rank,
 LogicalResult GatherOp::verify() {
   int64_t sourceRank = getSourceType().getRank();
   ArrayRef gatherDims = getGatherDims();
-  if (failed(verifyGatherOrScatterDims(getOperation(), gatherDims, sourceRank,
+  if (failed(verifyGatherOrScatterDims(getOperation(), gatherDims,
+                                       getIndicesType().getShape(), sourceRank,
                                        "gather", "source")))
     return failure();
@@ -3530,7 +3535,8 @@ void ScatterOp::getAsmResultNames(
 LogicalResult ScatterOp::verify() {
   int64_t destRank = getDestType().getRank();
   ArrayRef scatterDims = getScatterDims();
-  if (failed(verifyGatherOrScatterDims(getOperation(), scatterDims, destRank,
+  if (failed(verifyGatherOrScatterDims(getOperation(), scatterDims,
+                                       getIndicesType().getShape(), destRank,
                                        "scatter", "dest")))
     return failure();
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index d9db32b8801ac..84e6c59e403dd 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -455,41 +455,59 @@ func.func @gather_coordinate_rank_overflow(

 // -----

+func.func @gather_coordinate_rank_mismatch0(
+    %source: tensor<4x5x6xf32>, %indices: tensor) {
+  // 
expected-error@+1 {{gather_dims length must match the size of last dimension of indices}} + %out = tensor.gather %source[%indices] gather_dims([0, 1, 2]): + (tensor<4x5x6xf32>, tensor) -> tensor<1x2xf32> +} + +// ----- + +func.func @gather_coordinate_rank_mismatch1( + %source: tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { + // expected-error@+1 {{gather_dims length must match the size of last dimension of indices}} + %out = tensor.gather %source[%indices] gather_dims([0, 1, 2]): + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2xf32> +} + +// ----- + func.func @gather_coordinate_negative( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{gather_dims value must be non-negative}} %out = tensor.gather %source[%indices] gather_dims([-1]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32> return } // ----- func.func @gather_coordinate_overflow( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{gather_dims value must be smaller than source rank}} %out = tensor.gather %source[%indices] gather_dims([42]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32> return } // ----- -func.func @gather_coordinate_overflow( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { +func.func @gather_coordinate_increase( + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { // expected-error@+1 {{gather_dims values must be strictly increasing}} %out = tensor.gather %source[%indices] gather_dims([1, 0]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1x1xf32> return } // ----- func.func @gather_wrong_result_type( - %source : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %source : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { // expected-error@+1 {{result type mismatch: expected 'tensor<1x2x1x5x1xf32>' or its rank-reduced variant 'tensor<1x2x5xf32>' (got: 'tensor<1x2x1xf32>')}} %out = tensor.gather %source[%indices] gather_dims([0, 2]): - (tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32> + (tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32> return } @@ -517,12 +535,34 @@ func.func @scatter_coordinate_rank_overflow( // ----- +func.func @scatter_coordinate_rank_mismatch0( + %source : tensor, + %dest : tensor<4x5x6xf32>, %indices: tensor) { + // expected-error@+1 {{scatter_dims length must match the size of last dimension of indices}} + %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 1, 2]) unique: + (tensor, tensor<4x5x6xf32>, tensor) -> tensor<1x2xf32> + return +} + +// ----- + +func.func @scatter_coordinate_rank_mismatch1( + %source : tensor, + %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) { + // expected-error@+1 {{scatter_dims length must match the size of last dimension of indices}} + %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 1, 2]) unique: + (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2xf32> + return +} + +// ----- + func.func @scatter_coordinate_negative( %source : tensor, - %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) { + %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) { // expected-error@+1 {{scatter_dims value 
must be non-negative}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([-1]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32>
 return
}

// -----

func.func @scatter_coordinate_overflow(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x1xindex>) {
 // expected-error@+1 {{scatter_dims value must be smaller than dest rank}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([42]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x1xindex>) -> tensor<1x2x1xf32>
 return
}

// -----

-func.func @scatter_coordinate_overflow(
+func.func @scatter_coordinate_increase(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{scatter_dims values must be strictly increasing}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([1, 0]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1x1x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1x1xf32>
 return
}

@@ -552,10 +592,10 @@ func.func @scatter_coordinate_overflow(

 func.func @scatter_missing_unique(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{requires 'unique' attribute to be set}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 2]):
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32>
 return
}

@@ -563,10 +603,10 @@ func.func @scatter_missing_unique(

 func.func @scatter_wrong_result_type(
 %source : tensor,
- %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x3xindex>) {
+ %dest : tensor<4x5x6xf32>, %indices: tensor<1x2x2xindex>) {
 // expected-error@+1 {{source type mismatch: expected 'tensor<1x2x1x5x1xf32>' or its rank-reduced variant 'tensor<1x2x5xf32>' (got: 'tensor')}}
 %out = tensor.scatter %source into %dest[%indices] scatter_dims([0, 2]) unique:
-    (tensor, tensor<4x5x6xf32>, tensor<1x2x3xindex>) -> tensor<1x2x1xf32>
+    (tensor, tensor<4x5x6xf32>, tensor<1x2x2xindex>) -> tensor<1x2x1xf32>
 return
}

From 8e35c86977ce5529a9387657321ac9fefcdae5b5 Mon Sep 17 00:00:00 2001
From: Helena Kotas
Date: Thu, 5 Sep 2024 21:50:00 -0700
Subject: [PATCH 327/425] [HLSL] Apply resource attributes to the resource type rather than the handle member (#107160)

Converts existing resource attributes `[[hlsl::resource_class(..)]]` and
`[[is_rov]]` from declaration attributes to type attributes.

During type attribute processing all HLSL resource type attributes are
validated and collected by `SemaHLSL` (`SemaHLSL::handleResourceTypeAttr`). At
the end of the declaration they are combined into a single
`HLSLAttributedResourceType` instance (`SemaHLSL::ProcessResourceTypeAttributes`)
that wraps the original type and stores all of the necessary information about
the resource. `SemaHLSL` will also need to short-term-store the `TypeLoc`
information for the newly created type that will be grabbed by
`TypeSpecLocFiller` soon after it is created.
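A rough before/after sketch of the change (illustrative only; the struct and
field names are made up, and the updated ParserHLSL/AST tests in this patch are
the authoritative reference for the exact spelling):

  // Before: hlsl::resource_class was a declaration attribute on the handle field.
  struct MyBuffer {
    [[hlsl::resource_class(UAV)]] int handle;
  };

  // After: it is a type attribute; together with [[hlsl::is_rov]] it is folded
  // into a single HLSLAttributedResourceType that wraps the field's type.
  struct MyBuffer {
    int [[hlsl::resource_class(UAV)]] handle;
  };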
Updates all places that expected resource attributes on declarations like resource binding diagnostic, builtin types in HLSLExternalSemaSource, or codegen. Also includes implementation of `TreeTransform::TransformHLSLAttributedResourceType` that enables the use of attributed resource types inside templates. Fixes #104861 Part 2/2 --- clang/include/clang/AST/TypeLoc.h | 8 + clang/include/clang/Basic/Attr.td | 6 +- .../clang/Basic/DiagnosticSemaKinds.td | 1 + clang/include/clang/Sema/SemaHLSL.h | 20 +- clang/lib/AST/TypePrinter.cpp | 13 +- clang/lib/CodeGen/CGHLSLRuntime.cpp | 9 +- clang/lib/Sema/HLSLExternalSemaSource.cpp | 23 ++- clang/lib/Sema/SemaDeclAttr.cpp | 6 - clang/lib/Sema/SemaHLSL.cpp | 177 ++++++++++++------ clang/lib/Sema/TreeTransform.h | 22 ++- clang/test/AST/HLSL/RWBuffer-AST.hlsl | 10 +- ...a-attribute-supported-attributes-list.test | 2 - clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 17 +- .../ParserHLSL/hlsl_is_rov_attr_error.hlsl | 21 ++- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 48 ++--- .../hlsl_resource_class_attr_error.hlsl | 27 ++- .../hlsl_resource_handle_attrs.hlsl | 17 +- .../SemaHLSL/resource_binding_attr_error.hlsl | 2 +- .../resource_binding_attr_error_resource.hlsl | 10 +- .../resource_binding_attr_error_udt.hlsl | 10 +- 20 files changed, 281 insertions(+), 168 deletions(-) diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 5db39eb3aefa7..03fbdcf60140d 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -951,12 +951,20 @@ class HLSLAttributedResourceTypeLoc HLSLAttributedResourceLocInfo> { public: TypeLoc getWrappedLoc() const { return getInnerTypeLoc(); } + + TypeLoc getContainedLoc() const { + return TypeLoc(getTypePtr()->getContainedType(), getNonLocalData()); + } + void setSourceRange(const SourceRange &R) { getLocalData()->Range = R; } SourceRange getLocalSourceRange() const { return getLocalData()->Range; } void initializeLocal(ASTContext &Context, SourceLocation loc) { setSourceRange(SourceRange()); } QualType getInnerType() const { return getTypePtr()->getWrappedType(); } + unsigned getLocalDataSize() const { + return sizeof(HLSLAttributedResourceLocInfo); + } }; struct ObjCObjectTypeLocInfo { diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 8d2a362abc3c3..0c98f8e25a6fb 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4643,16 +4643,14 @@ def HLSLResource : InheritableAttr { let Documentation = [InternalOnly]; } -def HLSLROV : InheritableAttr { +def HLSLROV : TypeAttr { let Spellings = [CXX11<"hlsl", "is_rov">]; - let Subjects = SubjectList<[Struct]>; let LangOpts = [HLSL]; let Documentation = [InternalOnly]; } -def HLSLResourceClass : InheritableAttr { +def HLSLResourceClass : TypeAttr { let Spellings = [CXX11<"hlsl", "resource_class">]; - let Subjects = SubjectList<[Field]>; let LangOpts = [HLSL]; let Args = [ EnumArgument<"ResourceClass", "llvm::hlsl::ResourceClass", diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 083684670a980..58819a64813fc 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12368,6 +12368,7 @@ def err_hlsl_packoffset_cross_reg_boundary : Error<"packoffset cannot cross regi def err_hlsl_packoffset_alignment_mismatch : Error<"packoffset at 'y' not match alignment %0 required by %1">; def err_hlsl_pointers_unsupported : Error< 
"%select{pointers|references}0 are unsupported in HLSL">; +def err_hlsl_missing_resource_class : Error<"HLSL resource needs to have [[hlsl::resource_class()]] attribute">; def err_hlsl_operator_unsupported : Error< "the '%select{&|*|->}0' operator is unsupported in HLSL">; diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 285e4e5f3c765..64b39ca7712ee 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -15,8 +15,10 @@ #include "clang/AST/ASTFwd.h" #include "clang/AST/Attr.h" +#include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/TargetParser/Triple.h" #include @@ -26,6 +28,12 @@ class IdentifierInfo; class ParsedAttr; class Scope; +// FIXME: This can be hidden (as static function in SemaHLSL.cpp) once we no +// longer need to create builtin buffer types in HLSLExternalSemaSource. +bool CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, + ArrayRef AttrList, + QualType &ResType); + class SemaHLSL : public SemaBase { public: SemaHLSL(Sema &S); @@ -59,8 +67,6 @@ class SemaHLSL : public SemaBase { void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL); void handleShaderAttr(Decl *D, const ParsedAttr &AL); - void handleROVAttr(Decl *D, const ParsedAttr &AL); - void handleResourceClassAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); bool handleResourceTypeAttr(const ParsedAttr &AL); @@ -78,6 +84,16 @@ class SemaHLSL : public SemaBase { ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg); QualType getInoutParameterType(QualType Ty); + +private: + // HLSL resource type attributes need to be processed all at once. + // This is a list to collect them. + llvm::SmallVector HLSLResourcesTypeAttrs; + + /// SourceLocation corresponding to HLSLAttributedResourceTypeLocs that we + /// have not yet populated. 
+ llvm::DenseMap + LocsForHLSLAttributedResources; }; } // namespace clang diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index b1d9516c96eb7..3bef9370e621c 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1942,6 +1942,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::BTFTypeTag: llvm_unreachable("BTFTypeTag attribute handled separately"); + case attr::HLSLResourceClass: + case attr::HLSLROV: + llvm_unreachable("HLSL resource type attributes handled separately"); + case attr::OpenCLPrivateAddressSpace: case attr::OpenCLGlobalAddressSpace: case attr::OpenCLGlobalDeviceAddressSpace: @@ -2062,7 +2066,11 @@ void TypePrinter::printBTFTagAttributedAfter(const BTFTagAttributedType *T, void TypePrinter::printHLSLAttributedResourceBefore( const HLSLAttributedResourceType *T, raw_ostream &OS) { printBefore(T->getWrappedType(), OS); +} +void TypePrinter::printHLSLAttributedResourceAfter( + const HLSLAttributedResourceType *T, raw_ostream &OS) { + printAfter(T->getWrappedType(), OS); const HLSLAttributedResourceType::Attributes &Attrs = T->getAttrs(); OS << " [[hlsl::resource_class(" << HLSLResourceClassAttr::ConvertResourceClassToStr(Attrs.ResourceClass) @@ -2071,11 +2079,6 @@ void TypePrinter::printHLSLAttributedResourceBefore( OS << " [[hlsl::is_rov()]]"; } -void TypePrinter::printHLSLAttributedResourceAfter( - const HLSLAttributedResourceType *T, raw_ostream &OS) { - printAfter(T->getWrappedType(), OS); -} - void TypePrinter::printObjCInterfaceBefore(const ObjCInterfaceType *T, raw_ostream &OS) { OS << T->getDecl()->getName(); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 4bd7b6ba58de0..b6e6555e63fca 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -295,13 +295,14 @@ void CGHLSLRuntime::annotateHLSLResource(const VarDecl *D, GlobalVariable *GV) { // inside the record decl for (auto *FD : RD->fields()) { const auto *HLSLResAttr = FD->getAttr(); - const auto *HLSLResClassAttr = FD->getAttr(); - if (!HLSLResAttr || !HLSLResClassAttr) + const HLSLAttributedResourceType *AttrResType = + dyn_cast(FD->getType().getTypePtr()); + if (!HLSLResAttr || !AttrResType) continue; - llvm::hlsl::ResourceClass RC = HLSLResClassAttr->getResourceClass(); + llvm::hlsl::ResourceClass RC = AttrResType->getAttrs().ResourceClass; + bool IsROV = AttrResType->getAttrs().IsROV; llvm::hlsl::ResourceKind RK = HLSLResAttr->getResourceKind(); - bool IsROV = FD->hasAttr(); llvm::hlsl::ElementType ET = calculateElementType(CGM.getContext(), Ty); BufferResBinding Binding(D->getAttr()); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 9aacbe4ad9548..071e64fe56d48 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -13,10 +13,13 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/Type.h" #include "clang/Basic/AttrKinds.h" #include "clang/Basic/HLSLRuntime.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/SemaHLSL.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/HLSL/HLSLResource.h" #include @@ -107,7 +110,7 @@ struct BuiltinTypeDeclBuilder { } BuiltinTypeDeclBuilder & - addHandleMember(ResourceClass RC, ResourceKind RK, bool IsROV, + addHandleMember(Sema &S, ResourceClass RC, ResourceKind RK, bool IsROV, AccessSpecifier Access = 
AccessSpecifier::AS_private) { if (Record->isCompleteDefinition()) return *this; @@ -118,16 +121,16 @@ struct BuiltinTypeDeclBuilder { Ty = Record->getASTContext().getPointerType( QualType(TTD->getTypeForDecl(), 0)); } - // add handle member - Attr *ResourceClassAttr = - HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC); + + // add handle member with resource type attributes + QualType AttributedResTy = QualType(); + SmallVector Attrs = { + HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC), + IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr}; Attr *ResourceAttr = HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK); - Attr *ROVAttr = - IsROV ? HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr; - addMemberVariable("h", Ty, {ResourceClassAttr, ResourceAttr, ROVAttr}, - Access); - + if (CreateHLSLAttributedResourceType(S, Ty, Attrs, AttributedResTy)) + addMemberVariable("h", AttributedResTy, {ResourceAttr}, Access); return *this; } @@ -494,7 +497,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S, ResourceClass RC, ResourceKind RK, bool IsROV) { return BuiltinTypeDeclBuilder(Decl) - .addHandleMember(RC, RK, IsROV) + .addHandleMember(S, RC, RK, IsROV) .addDefaultHandleConstructor(S, RC); } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 33547c2e6e145..d068cb6a78f26 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6907,12 +6907,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_HLSLResourceBinding: S.HLSL().handleResourceBindingAttr(D, AL); break; - case ParsedAttr::AT_HLSLROV: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_HLSLResourceClass: - S.HLSL().handleResourceClassAttr(D, AL); - break; case ParsedAttr::AT_HLSLParamModifier: S.HLSL().handleParamModifierAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index d5ccd3815eb66..3b40769939f12 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaHLSL.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" @@ -22,7 +23,9 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -30,6 +33,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include +#include using namespace clang; @@ -559,46 +563,125 @@ void SemaHLSL::handleShaderAttr(Decl *D, const ParsedAttr &AL) { D->addAttr(NewAttr); } -void SemaHLSL::handleResourceClassAttr(Decl *D, const ParsedAttr &AL) { - if (!AL.isArgIdent(0)) { - Diag(AL.getLoc(), diag::err_attribute_argument_type) - << AL << AANT_ArgumentIdentifier; - return; - } +bool clang::CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, + ArrayRef AttrList, + QualType &ResType) { + assert(AttrList.size() && "expected list of resource attributes"); - IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Identifier = Loc->Ident->getName(); - SourceLocation ArgLoc = Loc->Loc; + QualType Contained = QualType(); + 
HLSLAttributedResourceType::Attributes ResAttrs = {}; - // Validate. - llvm::dxil::ResourceClass RC; - if (!HLSLResourceClassAttr::ConvertStrToResourceClass(Identifier, RC)) { - Diag(ArgLoc, diag::warn_attribute_type_not_supported) - << "ResourceClass" << Identifier; - return; + bool HasResourceClass = false; + for (const Attr *A : AttrList) { + if (!A) + continue; + switch (A->getKind()) { + case attr::HLSLResourceClass: { + llvm::dxil::ResourceClass RC = + cast(A)->getResourceClass(); + if (HasResourceClass) { + S.Diag(A->getLocation(), ResAttrs.ResourceClass == RC + ? diag::warn_duplicate_attribute_exact + : diag::warn_duplicate_attribute) + << A; + return false; + } + ResAttrs.ResourceClass = RC; + HasResourceClass = true; + break; + } + case attr::HLSLROV: + ResAttrs.IsROV = true; + break; + default: + llvm_unreachable("unhandled resource attribute type"); + } } - D->addAttr(HLSLResourceClassAttr::Create(getASTContext(), RC, ArgLoc)); + if (!HasResourceClass) { + S.Diag(AttrList.back()->getRange().getEnd(), + diag::err_hlsl_missing_resource_class); + return false; + } + + ResType = S.getASTContext().getHLSLAttributedResourceType(Wrapped, Contained, + ResAttrs); + return true; } -// Validates HLSL resource type attribute and adds it to the list to be -// processed into a single HLSLAttributedResourceType later on. -// Returns false if the attribute is invalid. +// Validates and creates an HLSL attribute that is applied as type attribute on +// HLSL resource. The attributes are collected in HLSLResourcesTypeAttrs and at +// the end of the declaration they are applied to the declaration type by +// wrapping it in HLSLAttributedResourceType. bool SemaHLSL::handleResourceTypeAttr(const ParsedAttr &AL) { - // FIXME: placeholder - not yet implemented + Attr *A = nullptr; + + // validate number of arguments + if (!AL.checkExactlyNumArgs(SemaRef, AL.getMinArgs())) + return false; + + switch (AL.getKind()) { + case ParsedAttr::AT_HLSLResourceClass: { + if (!AL.isArgIdent(0)) { + Diag(AL.getLoc(), diag::err_attribute_argument_type) + << AL << AANT_ArgumentIdentifier; + return false; + } + + IdentifierLoc *Loc = AL.getArgAsIdent(0); + StringRef Identifier = Loc->Ident->getName(); + SourceLocation ArgLoc = Loc->Loc; + + // Validate resource class value + llvm::dxil::ResourceClass RC; + if (!HLSLResourceClassAttr::ConvertStrToResourceClass(Identifier, RC)) { + Diag(ArgLoc, diag::warn_attribute_type_not_supported) + << "ResourceClass" << Identifier; + return false; + } + A = HLSLResourceClassAttr::Create(getASTContext(), RC, AL.getLoc()); + break; + } + case ParsedAttr::AT_HLSLROV: + A = HLSLROVAttr::Create(getASTContext(), AL.getLoc()); + break; + default: + llvm_unreachable("unhandled HLSL attribute"); + } + + HLSLResourcesTypeAttrs.emplace_back(A); return true; } -// Combines all resource type attributes and create HLSLAttributedResourceType. +// Combines all resource type attributes and creates HLSLAttributedResourceType. QualType SemaHLSL::ProcessResourceTypeAttributes(QualType CurrentType) { - // FIXME: placeholder - not yet implemented - return CurrentType; + if (!HLSLResourcesTypeAttrs.size()) + return CurrentType; + + QualType QT = CurrentType; + if (CreateHLSLAttributedResourceType(SemaRef, CurrentType, + HLSLResourcesTypeAttrs, QT)) { + const HLSLAttributedResourceType *RT = + dyn_cast(QT.getTypePtr()); + // Use the location of the first attribute as the location of the aggregated + // type. The attributes are stored in HLSLResourceTypeAttrs in the same + // order as they are parsed. 
+ SourceLocation Loc = HLSLResourcesTypeAttrs[0]->getLoc(); + LocsForHLSLAttributedResources.insert(std::pair(RT, Loc)); + } + HLSLResourcesTypeAttrs.clear(); + return QT; } // Returns source location for the HLSLAttributedResourceType SourceLocation SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { - // FIXME: placeholder - not yet implemented + auto I = LocsForHLSLAttributedResources.find(RT); + if (I != LocsForHLSLAttributedResources.end()) { + SourceLocation Loc = I->second; + LocsForHLSLAttributedResources.erase(I); + return Loc; + } return SourceLocation(); } @@ -656,33 +739,19 @@ static void updateResourceClassFlagsFromDeclResourceClass( } } -template -static const T *getSpecifiedHLSLAttrFromRecordDecl(RecordDecl *TheRecordDecl) { - if (!TheRecordDecl) - return nullptr; - - if (TheRecordDecl->hasAttr()) - return TheRecordDecl->getAttr(); - for (auto *FD : TheRecordDecl->fields()) { - const T *Attr = FD->getAttr(); - if (Attr) - return Attr; +const HLSLAttributedResourceType * +findAttributedResourceTypeOnField(VarDecl *VD) { + assert(VD != nullptr && "expected VarDecl"); + if (RecordDecl *RD = getRecordDeclFromVarDecl(VD)) { + for (auto *FD : RD->fields()) { + if (const HLSLAttributedResourceType *AttrResType = + dyn_cast(FD->getType().getTypePtr())) + return AttrResType; + } } return nullptr; } -template -static const T *getSpecifiedHLSLAttrFromVarDecl(VarDecl *VD) { - RecordDecl *TheRecordDecl = nullptr; - if (VD) { - TheRecordDecl = getRecordDeclFromVarDecl(VD); - if (!TheRecordDecl) - return nullptr; - } - - return getSpecifiedHLSLAttrFromRecordDecl(TheRecordDecl); -} - static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, const RecordType *RT) { llvm::SmallVector TypesToScan; @@ -702,10 +771,11 @@ static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, const RecordDecl *RD = RT->getDecl(); for (FieldDecl *FD : RD->fields()) { - if (HLSLResourceClassAttr *RCAttr = - FD->getAttr()) { + const Type *FieldTy = FD->getType().getTypePtr(); + if (const HLSLAttributedResourceType *AttrResType = + dyn_cast(FieldTy)) { updateResourceClassFlagsFromDeclResourceClass( - Flags, RCAttr->getResourceClass()); + Flags, AttrResType->getAttrs().ResourceClass); continue; } TypesToScan.emplace_back(FD->getType().getTypePtr()); @@ -732,11 +802,10 @@ static RegisterBindingFlags HLSLFillRegisterBindingFlags(Sema &S, } // Samplers, UAVs, and SRVs are VarDecl types else if (VarDecl *TheVarDecl = dyn_cast(TheDecl)) { - const HLSLResourceClassAttr *resClassAttr = - getSpecifiedHLSLAttrFromVarDecl(TheVarDecl); - if (resClassAttr) { + if (const HLSLAttributedResourceType *AttrResType = + findAttributedResourceTypeOnField(TheVarDecl)) { Flags.Resource = true; - Flags.ResourceClass = resClassAttr->getResourceClass(); + Flags.ResourceClass = AttrResType->getAttrs().ResourceClass; } else { const clang::Type *TheBaseType = TheVarDecl->getType().getTypePtr(); while (TheBaseType->isArrayType()) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 27eac401c28f5..0daf620b4123e 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7462,8 +7462,26 @@ QualType TreeTransform::TransformBTFTagAttributedType( template QualType TreeTransform::TransformHLSLAttributedResourceType( TypeLocBuilder &TLB, HLSLAttributedResourceTypeLoc TL) { - llvm_unreachable( - "Unexpected TreeTransform for HLSLAttributedResourceTypeLoc"); + + const HLSLAttributedResourceType *oldType = TL.getTypePtr(); + + 
QualType WrappedTy = getDerived().TransformType(TLB, TL.getWrappedLoc()); + if (WrappedTy.isNull()) + return QualType(); + + QualType ContainedTy = QualType(); + if (!oldType->getContainedType().isNull()) + ContainedTy = getDerived().TransformType(TLB, TL.getContainedLoc()); + + QualType Result = TL.getType(); + if (getDerived().AlwaysRebuild() || WrappedTy != oldType->getWrappedType() || + ContainedTy != oldType->getContainedType()) { + Result = SemaRef.Context.getHLSLAttributedResourceType( + WrappedTy, ContainedTy, oldType->getAttrs()); + } + + TLB.push(Result); + return Result; } template diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index 1f6ef60e121ea..0e7803ce50a89 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -30,8 +30,7 @@ RWBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class RWBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: implicit h 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -39,7 +38,7 @@ RWBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const RWBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -49,7 +48,7 @@ RWBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'RWBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -59,6 +58,5 @@ RWBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float * {{\[\[}}hlsl::resource_class(UAV)]]':'float *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test 
b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 5ebbd29b316bf..eca8633114902 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -82,8 +82,6 @@ // CHECK-NEXT: FunctionReturnThunks (SubjectMatchRule_function) // CHECK-NEXT: GNUInline (SubjectMatchRule_function) // CHECK-NEXT: HIPManaged (SubjectMatchRule_variable) -// CHECK-NEXT: HLSLROV (SubjectMatchRule_record_not_is_union) -// CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_field) // CHECK-NEXT: Hot (SubjectMatchRule_function) // CHECK-NEXT: HybridPatchable (SubjectMatchRule_function) // CHECK-NEXT: IBAction (SubjectMatchRule_objc_method_is_instance) diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 29850828ad3bc..24c85c6ccf7d7 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,9 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} -struct [[hlsl::is_rov]] Eg1 { - int i; +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -Eg1 e1; +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; +} diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index a21fed22220b6..68b2d9ecb190a 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,15 +1,16 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify -// expected-error@+1{{'is_rov' attribute takes no arguments}} -struct [[hlsl::is_rov(3)]] Eg1 { - int i; -}; +// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} +[[hlsl::is_rov()]] __hlsl_resource_t res0; -Eg1 e1; +// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} +__hlsl_resource_t [[hlsl::is_rov()]] res1; +// expected-error@+1{{'is_rov' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; + // expected-error@+1{{use of undeclared identifier 'gibberish'}} -struct [[hlsl::is_rov(gibberish)]] Eg2 { - int i; -}; +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; -Eg2 e2; +// duplicate attribute with the same meaning - no error +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov()]] [[hlsl::is_rov()]] res4; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index 4b002e2d89009..f11a64d33839b 100644 --- 
a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,32 +1,32 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} SRV -struct Eg1 { - [[hlsl::resource_class(SRV)]] int i; +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; -Eg1 e1; - -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:13:8 referenced struct Eg2 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} UAV -struct Eg2 { - [[hlsl::resource_class(UAV)]] int i; -}; -Eg2 e2; +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]]':'__hlsl_resource_t' +__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:20:8 referenced struct Eg3 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} CBuffer -struct Eg3 { - [[hlsl::resource_class(CBuffer)]] int i; -}; -Eg3 e3; +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]]':'__hlsl_resource_t' +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; +} -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:27:8 referenced struct Eg4 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} Sampler -struct Eg4 { - [[hlsl::resource_class(Sampler)]] int i; +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:23:29 MyBuffer2 +// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:23:29 struct MyBuffer2 definition +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +template struct MyBuffer2 { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; -Eg4 e4; -RWBuffer In : register(u1); +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:23:29 struct MyBuffer2 definition implicit_instantiation +// CHECK: TemplateArgument type 'float' +// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]]':'__hlsl_resource_t' +MyBuffer2 myBuffer2; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl index 76bed2f060783..01ff1c007e2b5 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl @@ -1,22 +1,19 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify -struct Eg1 { -// expected-error@+1{{'resource_class' attribute takes one argument}} - [[hlsl::resource_class()]] int i; -}; +// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} +[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; -Eg1 e1; +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class()]] e1; -struct Eg2 { // 
expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} - [[hlsl::resource_class(gibberish)]] int i; -}; +__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; -Eg2 e2; +// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3; -// expected-warning@+1{{'resource_class' attribute only applies to non-static data members}} -struct [[hlsl::resource_class(SRV)]] Eg3 { - int i; -}; +// expected-warning@+1{{attribute 'resource_class' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; -Eg3 e3; +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 320d1160e761d..6324a11fc8a2d 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,15 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'float *' -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV +// CHECK: -TemplateArgument type 'float' +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'float * {{\[\[}}hlsl::resource_class(UAV)]]':'float *' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RWBuffer Buffer1; -// CHECK: -ClassTemplateDecl 0x{{[0-9a-f]+}} <> implicit RasterizerOrderedBuffer -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} <> implicit class RasterizerOrderedBuffer definition -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'element_type *' -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RasterizerOrderedBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'vector' +// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'vector' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer -// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} <> Implicit -RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1); +RasterizerOrderedBuffer > BufferArray3[4]; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl index 6a0b5956545dd..cb728dca838c3 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl @@ -2,7 +2,7 @@ template struct MyTemplatedSRV { - [[hlsl::resource_class(SRV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; // valid, The register keyword in this statement isn't binding a resource, rather it is diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl index c40d1d7f60b34..4b6af47c0ab72 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl +++ 
b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -6,23 +6,23 @@ template struct MyTemplatedSRV { - [[hlsl::resource_class(SRV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySRV { - [[hlsl::resource_class(SRV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySampler { - [[hlsl::resource_class(Sampler)]] int x; + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; }; struct MyUAV { - [[hlsl::resource_class(UAV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MyCBuffer { - [[hlsl::resource_class(CBuffer)]] int x; + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; }; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl index edb3f30739cdf..ea2d576e4cca5 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -2,23 +2,23 @@ template struct MyTemplatedUAV { - [[hlsl::resource_class(UAV)]] T x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MySRV { - [[hlsl::resource_class(SRV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; }; struct MySampler { - [[hlsl::resource_class(Sampler)]] int x; + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; }; struct MyUAV { - [[hlsl::resource_class(UAV)]] int x; + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; }; struct MyCBuffer { - [[hlsl::resource_class(CBuffer)]] int x; + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; }; // Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0 From 77f1b481b884621c12cde5f2ce6f080f11dbbbcc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 6 Sep 2024 09:15:18 +0400 Subject: [PATCH 328/425] DAG: Lower single infinity is.fpclass tests to fcmp (#100380) InstCombine also should have taken care of this, but this should be helpful when the fcmp based lowering strategy tries to combine multiple tests. --- llvm/lib/CodeGen/CodeGenCommonISel.cpp | 2 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 16 ++++ llvm/test/CodeGen/X86/is_fpclass.ll | 92 ++++++++----------- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index d985751e2be0b..4cd2f6ae2fdb1 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -202,6 +202,8 @@ FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test, bool UseFCmp) { case fcSubnormal | fcZero | fcNan: return InvertedTest; case fcInf | fcNan: + case fcPosInf | fcNan: + case fcNegInf | fcNan: // If we're trying to use fcmp, we can take advantage of the nan check // behavior of the compare (but this is more instructions in the integer // expansion). diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c3affabb19d37..2b41b8a9a810e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8754,6 +8754,22 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, IsOrderedInf ? OrderedCmpOpcode : UnorderedCmpOpcode); } + if ((OrderedFPTestMask == fcPosInf || OrderedFPTestMask == fcNegInf) && + isCondCodeLegalOrCustom(IsOrdered ? 
OrderedCmpOpcode + : UnorderedCmpOpcode, + OperandVT.getSimpleVT())) { + // isposinf(x) --> x == inf + // isneginf(x) --> x == -inf + // isposinf(x) || nan --> x u== inf + // isneginf(x) || nan --> x u== -inf + + SDValue Inf = DAG.getConstantFP( + APFloat::getInf(Semantics, OrderedFPTestMask == fcNegInf), DL, + OperandVT); + return DAG.getSetCC(DL, ResultVT, Op, Inf, + IsOrdered ? OrderedCmpOpcode : UnorderedCmpOpcode); + } + if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) { // TODO: Could handle ordered case, but it produces worse code for // x86. Maybe handle ordered if fabs is free? diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index cc4d4c4543a51..97136dafa6c2c 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -2116,24 +2116,19 @@ entry: define i1 @is_plus_inf_or_nan_f(float %x) { ; X86-LABEL: is_plus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-NEXT: setge %al -; X86-NEXT: orb %cl, %al +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: is_plus_inf_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: sete %cl -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X64-NEXT: setge %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: sete %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 515) ; 0x200|0x3 = "+inf|nan" ret i1 %class @@ -2142,24 +2137,19 @@ define i1 @is_plus_inf_or_nan_f(float %x) { define i1 @is_minus_inf_or_nan_f(float %x) { ; X86-LABEL: is_minus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-NEXT: setge %al -; X86-NEXT: orb %cl, %al +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: is_minus_inf_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-NEXT: sete %cl -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X64-NEXT: setge %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: sete %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 7) ; "-inf|nan" ret i1 %class @@ -2168,24 +2158,19 @@ define i1 @is_minus_inf_or_nan_f(float %x) { define i1 @not_is_plus_inf_or_nan_f(float %x) { ; X86-LABEL: not_is_plus_inf_or_nan_f: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X86-NEXT: sete %cl -; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; 
X86-NEXT:    setl %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_is_plus_inf_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setl %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
  %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 508) ; ~(0x200|0x3) = "~(+inf|nan)"
  ret i1 %class
@@ -2194,24 +2179,19 @@ define i1 @not_is_plus_inf_or_nan_f(float %x) {
 define i1 @not_is_minus_inf_or_nan_f(float %x) {
 ; X86-LABEL: not_is_minus_inf_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    sete %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    setl %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_is_minus_inf_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setl %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
  %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1016) ; "~(-inf|nan)"
  ret i1 %class

From 093b8bfe6b64c916647ae64af8066df22bb6ea65 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 5 Sep 2024 22:29:23 -0700
Subject: [PATCH 329/425] [RISCV] Separate the calling convention handlers
 into their own file. NFC (#107484)

These are used by both SelectionDAG and GlobalISel and are separate from
RISCVTargetLowering. Having a separate file is how other targets are
structured, though other targets generate most of their calling convention
code through tablegen.

I moved the `CC_RISCV` functions from the `llvm::RISCV` namespace to `llvm::`.
That's what the tablegen code on other targets does and the functions already
have RISCV in their name. `RISCVCCAssignFn` is moved from
`RISCVTargetLowering` to the `llvm` namespace.
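For existing callers this is mostly a rename plus a new header. A
representative call site, sketched from the GlobalISel call-lowering change
below (surrounding code omitted):

  #include "RISCVCallingConv.h"

  // Was: RISCV::CC_RISCV / RISCV::CC_RISCV_FastCC, taking a
  // RISCVTargetLowering::RISCVCCAssignFn.
  RISCVOutgoingValueAssigner Assigner(
      CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV,
      /*IsRet=*/true);

`RISCVCCAssignFn` itself is now declared in the new RISCVCallingConv.h header
in the `llvm` namespace rather than inside `RISCVTargetLowering`.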
--- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + .../Target/RISCV/GISel/RISCVCallLowering.cpp | 29 +- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 684 +++++++++++++++++ llvm/lib/Target/RISCV/RISCVCallingConv.h | 51 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 691 +----------------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 31 +- 6 files changed, 762 insertions(+), 725 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVCallingConv.cpp create mode 100644 llvm/lib/Target/RISCV/RISCVCallingConv.h diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index cbb4c2cedfb97..aef0e4fbbf584 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -29,6 +29,7 @@ add_public_tablegen_target(RISCVCommonTableGen) add_llvm_target(RISCVCodeGen RISCVAsmPrinter.cpp + RISCVCallingConv.cpp RISCVCodeGenPrepare.cpp RISCVDeadRegisterDefinitions.cpp RISCVMakeCompressible.cpp diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 31a9df53a2aa1..14832204058f8 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "RISCVCallLowering.h" +#include "RISCVCallingConv.h" #include "RISCVISelLowering.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" @@ -30,14 +31,13 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // The function used internally to assign args - we ignore the AssignFn stored // by OutgoingValueAssigner since RISC-V implements its CC using a custom // function with a different signature. - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn; + RISCVCCAssignFn *RISCVAssignFn; // Whether this is assigning args for a return. bool IsRet; public: - RISCVOutgoingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVOutgoingValueAssigner(RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) : CallLowering::OutgoingValueAssigner(nullptr), RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} @@ -182,14 +182,13 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // The function used internally to assign args - we ignore the AssignFn stored // by IncomingValueAssigner since RISC-V implements its CC using a custom // function with a different signature. - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn; + RISCVCCAssignFn *RISCVAssignFn; // Whether this is assigning args from a return. bool IsRet; public: - RISCVIncomingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVIncomingValueAssigner(RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) : CallLowering::IncomingValueAssigner(nullptr), RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} @@ -425,7 +424,7 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, CC); RISCVOutgoingValueAssigner Assigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? 
CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/true); RISCVOutgoingValueHandler Handler(MIRBuilder, MF.getRegInfo(), Ret); if (!determineAndHandleAssignments(Handler, Assigner, SplitRetInfos, @@ -461,9 +460,9 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); - if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, - Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, - /*isRet=*/true, nullptr, TLI)) + if (CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, + Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, + /*isRet=*/true, nullptr, TLI)) return false; } return true; @@ -576,9 +575,9 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, ++Index; } - RISCVIncomingValueAssigner Assigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + RISCVIncomingValueAssigner Assigner(CC == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV, + /*IsRet=*/false); RISCVFormalArgHandler Handler(MIRBuilder, MF.getRegInfo()); SmallVector ArgLocs; @@ -639,7 +638,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Call.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); RISCVOutgoingValueAssigner ArgAssigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/false); RISCVOutgoingValueHandler ArgHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgInfos, @@ -667,7 +666,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); RISCVIncomingValueAssigner RetAssigner( - CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + CC == CallingConv::Fast ? CC_RISCV_FastCC : CC_RISCV, /*IsRet=*/true); RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp new file mode 100644 index 0000000000000..bf6ae2d1c2910 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -0,0 +1,684 @@ +//===-- RISCVCallingConv.cpp - RISC-V Custom CC Routines ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the RISC-V Calling Convention. +// +//===----------------------------------------------------------------------===// + +#include "RISCVCallingConv.h" +#include "RISCVSubtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCRegister.h" + +using namespace llvm; + +// Calling Convention Implementation. +// The expectations for frontend ABI lowering vary from target to target. +// Ideally, an LLVM frontend would be able to avoid worrying about many ABI +// details, but this is a longer term goal. For now, we simply try to keep the +// role of the frontend as simple and well-defined as possible. The rules can +// be summarised as: +// * Never split up large scalar arguments. We handle them here. 
+// * If a hardfloat calling convention is being used, and the struct may be +// passed in a pair of registers (fp+fp, int+fp), and both registers are +// available, then pass as two separate arguments. If either the GPRs or FPRs +// are exhausted, then pass according to the rule below. +// * If a struct could never be passed in registers or directly in a stack +// slot (as it is larger than 2*XLEN and the floating point rules don't +// apply), then pass it using a pointer with the byval attribute. +// * If a struct is less than 2*XLEN, then coerce to either a two-element +// word-sized array or a 2*XLEN scalar (depending on alignment). +// * The frontend can determine whether a struct is returned by reference or +// not based on its size and fields. If it will be returned by reference, the +// frontend must modify the prototype so a pointer with the sret annotation is +// passed as the first argument. This is not necessary for large scalar +// returns. +// * Struct return values and varargs should be coerced to structs containing +// register-size fields in the same situations they would be for fixed +// arguments. + +static const MCPhysReg ArgFPR16s[] = {RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, + RISCV::F13_H, RISCV::F14_H, RISCV::F15_H, + RISCV::F16_H, RISCV::F17_H}; +static const MCPhysReg ArgFPR32s[] = {RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, + RISCV::F13_F, RISCV::F14_F, RISCV::F15_F, + RISCV::F16_F, RISCV::F17_F}; +static const MCPhysReg ArgFPR64s[] = {RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, + RISCV::F13_D, RISCV::F14_D, RISCV::F15_D, + RISCV::F16_D, RISCV::F17_D}; +// This is an interim calling convention and it may be changed in the future. +static const MCPhysReg ArgVRs[] = { + RISCV::V8, RISCV::V9, RISCV::V10, RISCV::V11, RISCV::V12, RISCV::V13, + RISCV::V14, RISCV::V15, RISCV::V16, RISCV::V17, RISCV::V18, RISCV::V19, + RISCV::V20, RISCV::V21, RISCV::V22, RISCV::V23}; +static const MCPhysReg ArgVRM2s[] = {RISCV::V8M2, RISCV::V10M2, RISCV::V12M2, + RISCV::V14M2, RISCV::V16M2, RISCV::V18M2, + RISCV::V20M2, RISCV::V22M2}; +static const MCPhysReg ArgVRM4s[] = {RISCV::V8M4, RISCV::V12M4, RISCV::V16M4, + RISCV::V20M4}; +static const MCPhysReg ArgVRM8s[] = {RISCV::V8M8, RISCV::V16M8}; +static const MCPhysReg ArgVRN2M1s[] = { + RISCV::V8_V9, RISCV::V9_V10, RISCV::V10_V11, RISCV::V11_V12, + RISCV::V12_V13, RISCV::V13_V14, RISCV::V14_V15, RISCV::V15_V16, + RISCV::V16_V17, RISCV::V17_V18, RISCV::V18_V19, RISCV::V19_V20, + RISCV::V20_V21, RISCV::V21_V22, RISCV::V22_V23}; +static const MCPhysReg ArgVRN3M1s[] = { + RISCV::V8_V9_V10, RISCV::V9_V10_V11, RISCV::V10_V11_V12, + RISCV::V11_V12_V13, RISCV::V12_V13_V14, RISCV::V13_V14_V15, + RISCV::V14_V15_V16, RISCV::V15_V16_V17, RISCV::V16_V17_V18, + RISCV::V17_V18_V19, RISCV::V18_V19_V20, RISCV::V19_V20_V21, + RISCV::V20_V21_V22, RISCV::V21_V22_V23}; +static const MCPhysReg ArgVRN4M1s[] = { + RISCV::V8_V9_V10_V11, RISCV::V9_V10_V11_V12, RISCV::V10_V11_V12_V13, + RISCV::V11_V12_V13_V14, RISCV::V12_V13_V14_V15, RISCV::V13_V14_V15_V16, + RISCV::V14_V15_V16_V17, RISCV::V15_V16_V17_V18, RISCV::V16_V17_V18_V19, + RISCV::V17_V18_V19_V20, RISCV::V18_V19_V20_V21, RISCV::V19_V20_V21_V22, + RISCV::V20_V21_V22_V23}; +static const MCPhysReg ArgVRN5M1s[] = { + RISCV::V8_V9_V10_V11_V12, RISCV::V9_V10_V11_V12_V13, + RISCV::V10_V11_V12_V13_V14, RISCV::V11_V12_V13_V14_V15, + RISCV::V12_V13_V14_V15_V16, RISCV::V13_V14_V15_V16_V17, + RISCV::V14_V15_V16_V17_V18, RISCV::V15_V16_V17_V18_V19, + RISCV::V16_V17_V18_V19_V20, RISCV::V17_V18_V19_V20_V21, + 
RISCV::V18_V19_V20_V21_V22, RISCV::V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN6M1s[] = { + RISCV::V8_V9_V10_V11_V12_V13, RISCV::V9_V10_V11_V12_V13_V14, + RISCV::V10_V11_V12_V13_V14_V15, RISCV::V11_V12_V13_V14_V15_V16, + RISCV::V12_V13_V14_V15_V16_V17, RISCV::V13_V14_V15_V16_V17_V18, + RISCV::V14_V15_V16_V17_V18_V19, RISCV::V15_V16_V17_V18_V19_V20, + RISCV::V16_V17_V18_V19_V20_V21, RISCV::V17_V18_V19_V20_V21_V22, + RISCV::V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN7M1s[] = { + RISCV::V8_V9_V10_V11_V12_V13_V14, RISCV::V9_V10_V11_V12_V13_V14_V15, + RISCV::V10_V11_V12_V13_V14_V15_V16, RISCV::V11_V12_V13_V14_V15_V16_V17, + RISCV::V12_V13_V14_V15_V16_V17_V18, RISCV::V13_V14_V15_V16_V17_V18_V19, + RISCV::V14_V15_V16_V17_V18_V19_V20, RISCV::V15_V16_V17_V18_V19_V20_V21, + RISCV::V16_V17_V18_V19_V20_V21_V22, RISCV::V17_V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN8M1s[] = {RISCV::V8_V9_V10_V11_V12_V13_V14_V15, + RISCV::V9_V10_V11_V12_V13_V14_V15_V16, + RISCV::V10_V11_V12_V13_V14_V15_V16_V17, + RISCV::V11_V12_V13_V14_V15_V16_V17_V18, + RISCV::V12_V13_V14_V15_V16_V17_V18_V19, + RISCV::V13_V14_V15_V16_V17_V18_V19_V20, + RISCV::V14_V15_V16_V17_V18_V19_V20_V21, + RISCV::V15_V16_V17_V18_V19_V20_V21_V22, + RISCV::V16_V17_V18_V19_V20_V21_V22_V23}; +static const MCPhysReg ArgVRN2M2s[] = {RISCV::V8M2_V10M2, RISCV::V10M2_V12M2, + RISCV::V12M2_V14M2, RISCV::V14M2_V16M2, + RISCV::V16M2_V18M2, RISCV::V18M2_V20M2, + RISCV::V20M2_V22M2}; +static const MCPhysReg ArgVRN3M2s[] = { + RISCV::V8M2_V10M2_V12M2, RISCV::V10M2_V12M2_V14M2, + RISCV::V12M2_V14M2_V16M2, RISCV::V14M2_V16M2_V18M2, + RISCV::V16M2_V18M2_V20M2, RISCV::V18M2_V20M2_V22M2}; +static const MCPhysReg ArgVRN4M2s[] = { + RISCV::V8M2_V10M2_V12M2_V14M2, RISCV::V10M2_V12M2_V14M2_V16M2, + RISCV::V12M2_V14M2_V16M2_V18M2, RISCV::V14M2_V16M2_V18M2_V20M2, + RISCV::V16M2_V18M2_V20M2_V22M2}; +static const MCPhysReg ArgVRN2M4s[] = {RISCV::V8M4_V12M4, RISCV::V12M4_V16M4, + RISCV::V16M4_V20M4}; + +ArrayRef RISCV::getArgGPRs(const RISCVABI::ABI ABI) { + // The GPRs used for passing arguments in the ILP32* and LP64* ABIs, except + // the ILP32E ABI. + static const MCPhysReg ArgIGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15, + RISCV::X16, RISCV::X17}; + // The GPRs used for passing arguments in the ILP32E/ILP64E ABI. + static const MCPhysReg ArgEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15}; + + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return ArrayRef(ArgEGPRs); + + return ArrayRef(ArgIGPRs); +} + +static ArrayRef getFastCCArgGPRs(const RISCVABI::ABI ABI) { + // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used + // for save-restore libcall, so we don't use them. + // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register. + static const MCPhysReg FastCCIGPRs[] = { + RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, + RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31}; + + // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E. + static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, + RISCV::X13, RISCV::X14, RISCV::X15}; + + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return ArrayRef(FastCCEGPRs); + + return ArrayRef(FastCCIGPRs); +} + +// Pass a 2*XLEN argument that has been split into two XLEN values through +// registers or the stack as necessary. 
+static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
+                                ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
+                                MVT ValVT2, MVT LocVT2,
+                                ISD::ArgFlagsTy ArgFlags2, bool EABI) {
+  unsigned XLenInBytes = XLen / 8;
+  const RISCVSubtarget &STI =
+      State.getMachineFunction().getSubtarget<RISCVSubtarget>();
+  ArrayRef<MCPhysReg> ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI());
+
+  if (MCRegister Reg = State.AllocateReg(ArgGPRs)) {
+    // At least one half can be passed via register.
+    State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
+                                     VA1.getLocVT(), CCValAssign::Full));
+  } else {
+    // Both halves must be passed on the stack, with proper alignment.
+    // TODO: To be compatible with GCC's behaviors, we force them to have 4-byte
+    // alignment. This behavior may be changed when RV32E/ILP32E is ratified.
+    Align StackAlign(XLenInBytes);
+    if (!EABI || XLen != 32)
+      StackAlign = std::max(StackAlign, ArgFlags1.getNonZeroOrigAlign());
+    State.addLoc(
+        CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
+                            State.AllocateStack(XLenInBytes, StackAlign),
+                            VA1.getLocVT(), CCValAssign::Full));
+    State.addLoc(CCValAssign::getMem(
+        ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+        LocVT2, CCValAssign::Full));
+    return false;
+  }
+
+  if (MCRegister Reg = State.AllocateReg(ArgGPRs)) {
+    // The second half can also be passed via register.
+    State.addLoc(
+        CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
+  } else {
+    // The second half is passed via the stack, without additional alignment.
+    State.addLoc(CCValAssign::getMem(
+        ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+        LocVT2, CCValAssign::Full));
+  }
+
+  return false;
+}
+
+static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State,
+                                 const RISCVTargetLowering &TLI) {
+  const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+  if (RC == &RISCV::VRRegClass) {
+    // Assign the first mask argument to V0.
+    // This is an interim calling convention and it may be changed in the
+    // future.
+    if (ValVT.getVectorElementType() == MVT::i1)
+      if (MCRegister Reg = State.AllocateReg(RISCV::V0))
+        return Reg;
+    return State.AllocateReg(ArgVRs);
+  }
+  if (RC == &RISCV::VRM2RegClass)
+    return State.AllocateReg(ArgVRM2s);
+  if (RC == &RISCV::VRM4RegClass)
+    return State.AllocateReg(ArgVRM4s);
+  if (RC == &RISCV::VRM8RegClass)
+    return State.AllocateReg(ArgVRM8s);
+  if (RC == &RISCV::VRN2M1RegClass)
+    return State.AllocateReg(ArgVRN2M1s);
+  if (RC == &RISCV::VRN3M1RegClass)
+    return State.AllocateReg(ArgVRN3M1s);
+  if (RC == &RISCV::VRN4M1RegClass)
+    return State.AllocateReg(ArgVRN4M1s);
+  if (RC == &RISCV::VRN5M1RegClass)
+    return State.AllocateReg(ArgVRN5M1s);
+  if (RC == &RISCV::VRN6M1RegClass)
+    return State.AllocateReg(ArgVRN6M1s);
+  if (RC == &RISCV::VRN7M1RegClass)
+    return State.AllocateReg(ArgVRN7M1s);
+  if (RC == &RISCV::VRN8M1RegClass)
+    return State.AllocateReg(ArgVRN8M1s);
+  if (RC == &RISCV::VRN2M2RegClass)
+    return State.AllocateReg(ArgVRN2M2s);
+  if (RC == &RISCV::VRN3M2RegClass)
+    return State.AllocateReg(ArgVRN3M2s);
+  if (RC == &RISCV::VRN4M2RegClass)
+    return State.AllocateReg(ArgVRN4M2s);
+  if (RC == &RISCV::VRN2M4RegClass)
+    return State.AllocateReg(ArgVRN2M4s);
+  llvm_unreachable("Unhandled register class for ValueType");
+}
+
+// Implements the RISC-V calling convention. Returns true upon failure.
+bool llvm::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, + MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, + bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { + unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); + assert(XLen == 32 || XLen == 64); + MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; + + // Static chain parameter must not be passed in normal argument registers, + // so we assign t2 for it as done in GCC's __builtin_call_with_static_chain + if (ArgFlags.isNest()) { + if (MCRegister Reg = State.AllocateReg(RISCV::X7)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + // Any return value split in to more than two values can't be returned + // directly. Vectors are returned via the available vector registers. + if (!LocVT.isVector() && IsRet && ValNo > 1) + return true; + + // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a + // variadic argument, or if no F16/F32 argument registers are available. + bool UseGPRForF16_F32 = true; + // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a + // variadic argument, or if no F64 argument registers are available. + bool UseGPRForF64 = true; + + switch (ABI) { + default: + llvm_unreachable("Unexpected ABI"); + case RISCVABI::ABI_ILP32: + case RISCVABI::ABI_ILP32E: + case RISCVABI::ABI_LP64: + case RISCVABI::ABI_LP64E: + break; + case RISCVABI::ABI_ILP32F: + case RISCVABI::ABI_LP64F: + UseGPRForF16_F32 = !IsFixed; + break; + case RISCVABI::ABI_ILP32D: + case RISCVABI::ABI_LP64D: + UseGPRForF16_F32 = !IsFixed; + UseGPRForF64 = !IsFixed; + break; + } + + // FPR16, FPR32, and FPR64 alias each other. + if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) { + UseGPRForF16_F32 = true; + UseGPRForF64 = true; + } + + // From this point on, rely on UseGPRForF16_F32, UseGPRForF64 and + // similar local variables rather than directly checking against the target + // ABI. + + ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); + + if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || + (ValVT == MVT::f32 && XLen == 64))) { + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (Reg) { + LocVT = XLenVT; + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (UseGPRForF16_F32 && + (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { + LocVT = XLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } + + // If this is a variadic argument, the RISC-V calling convention requires + // that it is assigned an 'even' or 'aligned' register if it has 8-byte + // alignment (RV32) or 16-byte alignment (RV64). An aligned register should + // be used regardless of whether the original argument was split during + // legalisation or not. The argument will not be passed by registers if the + // original type is larger than 2*XLEN, so the register alignment rule does + // not apply. + // TODO: To be compatible with GCC's behaviors, we don't align registers + // currently if we are using ILP32E calling convention. This behavior may be + // changed when RV32E/ILP32E is ratified. 
+ unsigned TwoXLenInBytes = (2 * XLen) / 8; + if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && + DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && + ABI != RISCVABI::ABI_ILP32E) { + unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); + // Skip 'odd' register if necessary. + if (RegIdx != std::size(ArgGPRs) && RegIdx % 2 == 1) + State.AllocateReg(ArgGPRs); + } + + SmallVectorImpl &PendingLocs = State.getPendingLocs(); + SmallVectorImpl &PendingArgFlags = + State.getPendingArgFlags(); + + assert(PendingLocs.size() == PendingArgFlags.size() && + "PendingLocs and PendingArgFlags out of sync"); + + // Handle passing f64 on RV32D with a soft float ABI or when floating point + // registers are exhausted. + if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { + assert(PendingLocs.empty() && "Can't lower f64 if it is split"); + // Depending on available argument GPRS, f64 may be passed in a pair of + // GPRs, split between a GPR and the stack, or passed completely on the + // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these + // cases. + MCRegister Reg = State.AllocateReg(ArgGPRs); + if (!Reg) { + unsigned StackOffset = State.AllocateStack(8, Align(8)); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; + } + LocVT = MVT::i32; + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + MCRegister HiReg = State.AllocateReg(ArgGPRs); + if (HiReg) { + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); + } else { + unsigned StackOffset = State.AllocateStack(4, Align(4)); + State.addLoc( + CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } + return false; + } + + // Fixed-length vectors are located in the corresponding scalable-vector + // container types. + if (ValVT.isFixedLengthVector()) + LocVT = TLI.getContainerForFixedLengthVector(LocVT); + + // Split arguments might be passed indirectly, so keep track of the pending + // values. Split vectors are passed via a mix of registers and indirectly, so + // treat them as we would any other argument. + if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + PendingLocs.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + PendingArgFlags.push_back(ArgFlags); + if (!ArgFlags.isSplitEnd()) { + return false; + } + } + + // If the split argument only had two elements, it should be passed directly + // in registers or on the stack. + if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() && + PendingLocs.size() <= 2) { + assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); + // Apply the normal calling convention rules to the first half of the + // split argument. + CCValAssign VA = PendingLocs[0]; + ISD::ArgFlagsTy AF = PendingArgFlags[0]; + PendingLocs.clear(); + PendingArgFlags.clear(); + return CC_RISCVAssign2XLen( + XLen, State, VA, AF, ValNo, ValVT, LocVT, ArgFlags, + ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E); + } + + // Allocate to a register if possible, or else a stack slot. 
+ MCRegister Reg; + unsigned StoreSizeBytes = XLen / 8; + Align StackAlign = Align(XLen / 8); + + if ((ValVT == MVT::f16 || ValVT == MVT::bf16) && !UseGPRForF16_F32) + Reg = State.AllocateReg(ArgFPR16s); + else if (ValVT == MVT::f32 && !UseGPRForF16_F32) + Reg = State.AllocateReg(ArgFPR32s); + else if (ValVT == MVT::f64 && !UseGPRForF64) + Reg = State.AllocateReg(ArgFPR64s); + else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { + Reg = allocateRVVReg(ValVT, ValNo, State, TLI); + if (!Reg) { + // For return values, the vector must be passed fully via registers or + // via the stack. + // FIXME: The proposed vector ABI only mandates v8-v15 for return values, + // but we're using all of them. + if (IsRet) + return true; + // Try using a GPR to pass the address + if ((Reg = State.AllocateReg(ArgGPRs))) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + } else if (ValVT.isScalableVector()) { + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; + } else { + // Pass fixed-length vectors on the stack. + LocVT = ValVT; + StoreSizeBytes = ValVT.getStoreSize(); + // Align vectors to their element sizes, being careful for vXi1 + // vectors. + StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); + } + } + } else { + Reg = State.AllocateReg(ArgGPRs); + } + + unsigned StackOffset = + Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign); + + // If we reach this point and PendingLocs is non-empty, we must be at the + // end of a split argument that must be passed indirectly. + if (!PendingLocs.empty()) { + assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); + assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); + + for (auto &It : PendingLocs) { + if (Reg) + It.convertToReg(Reg); + else + It.convertToMem(StackOffset); + State.addLoc(It); + } + PendingLocs.clear(); + PendingArgFlags.clear(); + return false; + } + + assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT || + (TLI.getSubtarget().hasVInstructions() && + (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) && + "Expected an XLenVT or vector types at this stage"); + + if (Reg) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + + // When a scalar floating-point value is passed on the stack, no + // bit-conversion is needed. + if (ValVT.isFloatingPoint() && LocInfo != CCValAssign::Indirect) { + assert(!ValVT.isVector()); + LocVT = ValVT; + LocInfo = CCValAssign::Full; + } + State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; +} + +// FastCC has less than 1% performance improvement for some particular +// benchmark. But theoretically, it may have benefit for some cases. 
+bool llvm::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, + unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, + bool IsFixed, bool IsRet, Type *OrigTy, + const RISCVTargetLowering &TLI) { + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + const RISCVSubtarget &Subtarget = TLI.getSubtarget(); + + if (LocVT == MVT::f16 && Subtarget.hasStdExtZfhmin()) { + static const MCPhysReg FPR16List[] = { + RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H, + RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H, + RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H, + RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H}; + if (MCRegister Reg = State.AllocateReg(FPR16List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { + static const MCPhysReg FPR32List[] = { + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, + RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, + RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, + RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; + if (MCRegister Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { + static const MCPhysReg FPR64List[] = { + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, + RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, + RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D, + RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; + if (MCRegister Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + // Check if there is an available GPR before hitting the stack. 
+ if ((LocVT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) || + (LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || + (LocVT == MVT::f64 && Subtarget.is64Bit() && + Subtarget.hasStdExtZdinx())) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + if (LocVT.getSizeInBits() != Subtarget.getXLen()) { + LocVT = Subtarget.getXLenVT(); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + LocVT = Subtarget.getXLenVT(); + LocInfo = CCValAssign::BCvt; + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f16) { + unsigned Offset2 = State.AllocateStack(2, Align(2)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset2, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i32 || LocVT == MVT::f32) { + unsigned Offset4 = State.AllocateStack(4, Align(4)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i64 || LocVT == MVT::f64) { + unsigned Offset5 = State.AllocateStack(8, Align(8)); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); + return false; + } + + if (LocVT.isVector()) { + if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { + // Fixed-length vectors are located in the corresponding scalable-vector + // container types. + if (ValVT.isFixedLengthVector()) + LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + } else { + // Try and pass the address via a "fast" GPR. + if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + LocInfo = CCValAssign::Indirect; + LocVT = TLI.getSubtarget().getXLenVT(); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo)); + } else if (ValVT.isFixedLengthVector()) { + auto StackAlign = + MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); + unsigned StackOffset = + State.AllocateStack(ValVT.getStoreSize(), StackAlign); + State.addLoc( + CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + } else { + // Can't pass scalable vectors on the stack. + return true; + } + } + + return false; + } + + return true; // CC didn't match. +} + +bool llvm::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State) { + if (ArgFlags.isNest()) { + report_fatal_error( + "Attribute 'nest' is not supported in GHC calling convention"); + } + + static const MCPhysReg GPRList[] = { + RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, + RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; + + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim + // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 + if (MCRegister Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + const RISCVSubtarget &Subtarget = + State.getMachineFunction().getSubtarget(); + + if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { + // Pass in STG registers: F1, ..., F6 + // fs0 ... 
fs5 + static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F, + RISCV::F18_F, RISCV::F19_F, + RISCV::F20_F, RISCV::F21_F}; + if (MCRegister Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { + // Pass in STG registers: D1, ..., D6 + // fs6 ... fs11 + static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D, + RISCV::F24_D, RISCV::F25_D, + RISCV::F26_D, RISCV::F27_D}; + if (MCRegister Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if ((LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || + (LocVT == MVT::f64 && Subtarget.hasStdExtZdinx() && + Subtarget.is64Bit())) { + if (MCRegister Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + report_fatal_error("No registers left in GHC calling convention"); + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h new file mode 100644 index 0000000000000..9154a31d11608 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -0,0 +1,51 @@ +//===-- RISCVCallingConv.h - RISC-V Custom CC Routines ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the custom routines for the RISC-V Calling Convention. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/RISCVBaseInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" + +namespace llvm { + +class DataLayout; +class RISCVTargetLowering; + +/// RISCVCCAssignFn - This target-specific function extends the default +/// CCValAssign with additional information used to lower RISC-V calling +/// conventions. 
+typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI,
+                             unsigned ValNo, MVT ValVT, MVT LocVT,
+                             CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State,
+                             bool IsFixed, bool IsRet, Type *OrigTy,
+                             const RISCVTargetLowering &TLI);
+
+bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
+              MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+              ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
+              bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI);
+
+bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
+                     MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+                     ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
+                     bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI);
+
+bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
+                  CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                  CCState &State);
+
+namespace RISCV {
+
+ArrayRef<MCPhysReg> getArgGPRs(const RISCVABI::ABI ABI);
+
+} // end namespace RISCV
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6b4219b462384..acee6443bc452 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18934,492 +18934,6 @@ void RISCVTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
         MachineOperand::CreateReg(RISCV::FRM, /*isDef*/ false, /*isImp*/ true));
 }
 
-// Calling Convention Implementation.
-// The expectations for frontend ABI lowering vary from target to target.
-// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
-// details, but this is a longer term goal. For now, we simply try to keep the
-// role of the frontend as simple and well-defined as possible. The rules can
-// be summarised as:
-// * Never split up large scalar arguments. We handle them here.
-// * If a hardfloat calling convention is being used, and the struct may be
-// passed in a pair of registers (fp+fp, int+fp), and both registers are
-// available, then pass as two separate arguments. If either the GPRs or FPRs
-// are exhausted, then pass according to the rule below.
-// * If a struct could never be passed in registers or directly in a stack
-// slot (as it is larger than 2*XLEN and the floating point rules don't
-// apply), then pass it using a pointer with the byval attribute.
-// * If a struct is less than 2*XLEN, then coerce to either a two-element
-// word-sized array or a 2*XLEN scalar (depending on alignment).
-// * The frontend can determine whether a struct is returned by reference or
-// not based on its size and fields. If it will be returned by reference, the
-// frontend must modify the prototype so a pointer with the sret annotation is
-// passed as the first argument. This is not necessary for large scalar
-// returns.
-// * Struct return values and varargs should be coerced to structs containing
-// register-size fields in the same situations they would be for fixed
-// arguments.
- -static const MCPhysReg ArgFPR16s[] = { - RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, - RISCV::F14_H, RISCV::F15_H, RISCV::F16_H, RISCV::F17_H -}; -static const MCPhysReg ArgFPR32s[] = { - RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, - RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F -}; -static const MCPhysReg ArgFPR64s[] = { - RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, - RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D -}; -// This is an interim calling convention and it may be changed in the future. -static const MCPhysReg ArgVRs[] = { - RISCV::V8, RISCV::V9, RISCV::V10, RISCV::V11, RISCV::V12, RISCV::V13, - RISCV::V14, RISCV::V15, RISCV::V16, RISCV::V17, RISCV::V18, RISCV::V19, - RISCV::V20, RISCV::V21, RISCV::V22, RISCV::V23}; -static const MCPhysReg ArgVRM2s[] = {RISCV::V8M2, RISCV::V10M2, RISCV::V12M2, - RISCV::V14M2, RISCV::V16M2, RISCV::V18M2, - RISCV::V20M2, RISCV::V22M2}; -static const MCPhysReg ArgVRM4s[] = {RISCV::V8M4, RISCV::V12M4, RISCV::V16M4, - RISCV::V20M4}; -static const MCPhysReg ArgVRM8s[] = {RISCV::V8M8, RISCV::V16M8}; -static const MCPhysReg ArgVRN2M1s[] = { - RISCV::V8_V9, RISCV::V9_V10, RISCV::V10_V11, RISCV::V11_V12, - RISCV::V12_V13, RISCV::V13_V14, RISCV::V14_V15, RISCV::V15_V16, - RISCV::V16_V17, RISCV::V17_V18, RISCV::V18_V19, RISCV::V19_V20, - RISCV::V20_V21, RISCV::V21_V22, RISCV::V22_V23}; -static const MCPhysReg ArgVRN3M1s[] = { - RISCV::V8_V9_V10, RISCV::V9_V10_V11, RISCV::V10_V11_V12, - RISCV::V11_V12_V13, RISCV::V12_V13_V14, RISCV::V13_V14_V15, - RISCV::V14_V15_V16, RISCV::V15_V16_V17, RISCV::V16_V17_V18, - RISCV::V17_V18_V19, RISCV::V18_V19_V20, RISCV::V19_V20_V21, - RISCV::V20_V21_V22, RISCV::V21_V22_V23}; -static const MCPhysReg ArgVRN4M1s[] = { - RISCV::V8_V9_V10_V11, RISCV::V9_V10_V11_V12, RISCV::V10_V11_V12_V13, - RISCV::V11_V12_V13_V14, RISCV::V12_V13_V14_V15, RISCV::V13_V14_V15_V16, - RISCV::V14_V15_V16_V17, RISCV::V15_V16_V17_V18, RISCV::V16_V17_V18_V19, - RISCV::V17_V18_V19_V20, RISCV::V18_V19_V20_V21, RISCV::V19_V20_V21_V22, - RISCV::V20_V21_V22_V23}; -static const MCPhysReg ArgVRN5M1s[] = { - RISCV::V8_V9_V10_V11_V12, RISCV::V9_V10_V11_V12_V13, - RISCV::V10_V11_V12_V13_V14, RISCV::V11_V12_V13_V14_V15, - RISCV::V12_V13_V14_V15_V16, RISCV::V13_V14_V15_V16_V17, - RISCV::V14_V15_V16_V17_V18, RISCV::V15_V16_V17_V18_V19, - RISCV::V16_V17_V18_V19_V20, RISCV::V17_V18_V19_V20_V21, - RISCV::V18_V19_V20_V21_V22, RISCV::V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN6M1s[] = { - RISCV::V8_V9_V10_V11_V12_V13, RISCV::V9_V10_V11_V12_V13_V14, - RISCV::V10_V11_V12_V13_V14_V15, RISCV::V11_V12_V13_V14_V15_V16, - RISCV::V12_V13_V14_V15_V16_V17, RISCV::V13_V14_V15_V16_V17_V18, - RISCV::V14_V15_V16_V17_V18_V19, RISCV::V15_V16_V17_V18_V19_V20, - RISCV::V16_V17_V18_V19_V20_V21, RISCV::V17_V18_V19_V20_V21_V22, - RISCV::V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN7M1s[] = { - RISCV::V8_V9_V10_V11_V12_V13_V14, RISCV::V9_V10_V11_V12_V13_V14_V15, - RISCV::V10_V11_V12_V13_V14_V15_V16, RISCV::V11_V12_V13_V14_V15_V16_V17, - RISCV::V12_V13_V14_V15_V16_V17_V18, RISCV::V13_V14_V15_V16_V17_V18_V19, - RISCV::V14_V15_V16_V17_V18_V19_V20, RISCV::V15_V16_V17_V18_V19_V20_V21, - RISCV::V16_V17_V18_V19_V20_V21_V22, RISCV::V17_V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN8M1s[] = {RISCV::V8_V9_V10_V11_V12_V13_V14_V15, - RISCV::V9_V10_V11_V12_V13_V14_V15_V16, - RISCV::V10_V11_V12_V13_V14_V15_V16_V17, - RISCV::V11_V12_V13_V14_V15_V16_V17_V18, - RISCV::V12_V13_V14_V15_V16_V17_V18_V19, - 
RISCV::V13_V14_V15_V16_V17_V18_V19_V20, - RISCV::V14_V15_V16_V17_V18_V19_V20_V21, - RISCV::V15_V16_V17_V18_V19_V20_V21_V22, - RISCV::V16_V17_V18_V19_V20_V21_V22_V23}; -static const MCPhysReg ArgVRN2M2s[] = {RISCV::V8M2_V10M2, RISCV::V10M2_V12M2, - RISCV::V12M2_V14M2, RISCV::V14M2_V16M2, - RISCV::V16M2_V18M2, RISCV::V18M2_V20M2, - RISCV::V20M2_V22M2}; -static const MCPhysReg ArgVRN3M2s[] = { - RISCV::V8M2_V10M2_V12M2, RISCV::V10M2_V12M2_V14M2, - RISCV::V12M2_V14M2_V16M2, RISCV::V14M2_V16M2_V18M2, - RISCV::V16M2_V18M2_V20M2, RISCV::V18M2_V20M2_V22M2}; -static const MCPhysReg ArgVRN4M2s[] = { - RISCV::V8M2_V10M2_V12M2_V14M2, RISCV::V10M2_V12M2_V14M2_V16M2, - RISCV::V12M2_V14M2_V16M2_V18M2, RISCV::V14M2_V16M2_V18M2_V20M2, - RISCV::V16M2_V18M2_V20M2_V22M2}; -static const MCPhysReg ArgVRN2M4s[] = {RISCV::V8M4_V12M4, RISCV::V12M4_V16M4, - RISCV::V16M4_V20M4}; - -ArrayRef RISCV::getArgGPRs(const RISCVABI::ABI ABI) { - // The GPRs used for passing arguments in the ILP32* and LP64* ABIs, except - // the ILP32E ABI. - static const MCPhysReg ArgIGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17}; - // The GPRs used for passing arguments in the ILP32E/ILP64E ABI. - static const MCPhysReg ArgEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15}; - - if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) - return ArrayRef(ArgEGPRs); - - return ArrayRef(ArgIGPRs); -} - -static ArrayRef getFastCCArgGPRs(const RISCVABI::ABI ABI) { - // The GPRs used for passing arguments in the FastCC, X5 and X6 might be used - // for save-restore libcall, so we don't use them. - // Don't use X7 for fastcc, since Zicfilp uses X7 as the label register. - static const MCPhysReg FastCCIGPRs[] = { - RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17, RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31}; - - // The GPRs used for passing arguments in the FastCC when using ILP32E/ILP64E. - static const MCPhysReg FastCCEGPRs[] = {RISCV::X10, RISCV::X11, RISCV::X12, - RISCV::X13, RISCV::X14, RISCV::X15}; - - if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) - return ArrayRef(FastCCEGPRs); - - return ArrayRef(FastCCIGPRs); -} - -// Pass a 2*XLEN argument that has been split into two XLEN values through -// registers or the stack as necessary. -static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, - ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2, - MVT ValVT2, MVT LocVT2, - ISD::ArgFlagsTy ArgFlags2, bool EABI) { - unsigned XLenInBytes = XLen / 8; - const RISCVSubtarget &STI = - State.getMachineFunction().getSubtarget(); - ArrayRef ArgGPRs = RISCV::getArgGPRs(STI.getTargetABI()); - - if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { - // At least one half can be passed via register. - State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, - VA1.getLocVT(), CCValAssign::Full)); - } else { - // Both halves must be passed on the stack, with proper alignment. - // TODO: To be compatible with GCC's behaviors, we force them to have 4-byte - // alignment. This behavior may be changed when RV32E/ILP32E is ratified. 
- Align StackAlign(XLenInBytes); - if (!EABI || XLen != 32) - StackAlign = std::max(StackAlign, ArgFlags1.getNonZeroOrigAlign()); - State.addLoc( - CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(), - State.AllocateStack(XLenInBytes, StackAlign), - VA1.getLocVT(), CCValAssign::Full)); - State.addLoc(CCValAssign::getMem( - ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)), - LocVT2, CCValAssign::Full)); - return false; - } - - if (MCRegister Reg = State.AllocateReg(ArgGPRs)) { - // The second half can also be passed via register. - State.addLoc( - CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); - } else { - // The second half is passed via the stack, without additional alignment. - State.addLoc(CCValAssign::getMem( - ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)), - LocVT2, CCValAssign::Full)); - } - - return false; -} - -static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, - const RISCVTargetLowering &TLI) { - const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); - if (RC == &RISCV::VRRegClass) { - // Assign the first mask argument to V0. - // This is an interim calling convention and it may be changed in the - // future. - if (ValVT.getVectorElementType() == MVT::i1) - if (MCRegister Reg = State.AllocateReg(RISCV::V0)) - return Reg; - return State.AllocateReg(ArgVRs); - } - if (RC == &RISCV::VRM2RegClass) - return State.AllocateReg(ArgVRM2s); - if (RC == &RISCV::VRM4RegClass) - return State.AllocateReg(ArgVRM4s); - if (RC == &RISCV::VRM8RegClass) - return State.AllocateReg(ArgVRM8s); - if (RC == &RISCV::VRN2M1RegClass) - return State.AllocateReg(ArgVRN2M1s); - if (RC == &RISCV::VRN3M1RegClass) - return State.AllocateReg(ArgVRN3M1s); - if (RC == &RISCV::VRN4M1RegClass) - return State.AllocateReg(ArgVRN4M1s); - if (RC == &RISCV::VRN5M1RegClass) - return State.AllocateReg(ArgVRN5M1s); - if (RC == &RISCV::VRN6M1RegClass) - return State.AllocateReg(ArgVRN6M1s); - if (RC == &RISCV::VRN7M1RegClass) - return State.AllocateReg(ArgVRN7M1s); - if (RC == &RISCV::VRN8M1RegClass) - return State.AllocateReg(ArgVRN8M1s); - if (RC == &RISCV::VRN2M2RegClass) - return State.AllocateReg(ArgVRN2M2s); - if (RC == &RISCV::VRN3M2RegClass) - return State.AllocateReg(ArgVRN3M2s); - if (RC == &RISCV::VRN4M2RegClass) - return State.AllocateReg(ArgVRN4M2s); - if (RC == &RISCV::VRN2M4RegClass) - return State.AllocateReg(ArgVRN2M4s); - llvm_unreachable("Unhandled register class for ValueType"); -} - -// Implements the RISC-V calling convention. Returns true upon failure. -bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI) { - unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); - assert(XLen == 32 || XLen == 64); - MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; - - // Static chain parameter must not be passed in normal argument registers, - // so we assign t2 for it as done in GCC's __builtin_call_with_static_chain - if (ArgFlags.isNest()) { - if (MCRegister Reg = State.AllocateReg(RISCV::X7)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - // Any return value split in to more than two values can't be returned - // directly. Vectors are returned via the available vector registers. 
- if (!LocVT.isVector() && IsRet && ValNo > 1) - return true; - - // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a - // variadic argument, or if no F16/F32 argument registers are available. - bool UseGPRForF16_F32 = true; - // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a - // variadic argument, or if no F64 argument registers are available. - bool UseGPRForF64 = true; - - switch (ABI) { - default: - llvm_unreachable("Unexpected ABI"); - case RISCVABI::ABI_ILP32: - case RISCVABI::ABI_ILP32E: - case RISCVABI::ABI_LP64: - case RISCVABI::ABI_LP64E: - break; - case RISCVABI::ABI_ILP32F: - case RISCVABI::ABI_LP64F: - UseGPRForF16_F32 = !IsFixed; - break; - case RISCVABI::ABI_ILP32D: - case RISCVABI::ABI_LP64D: - UseGPRForF16_F32 = !IsFixed; - UseGPRForF64 = !IsFixed; - break; - } - - // FPR16, FPR32, and FPR64 alias each other. - if (State.getFirstUnallocated(ArgFPR32s) == std::size(ArgFPR32s)) { - UseGPRForF16_F32 = true; - UseGPRForF64 = true; - } - - // From this point on, rely on UseGPRForF16_F32, UseGPRForF64 and - // similar local variables rather than directly checking against the target - // ABI. - - ArrayRef ArgGPRs = RISCV::getArgGPRs(ABI); - - if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::bf16 || - (ValVT == MVT::f32 && XLen == 64))) { - MCRegister Reg = State.AllocateReg(ArgGPRs); - if (Reg) { - LocVT = XLenVT; - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (UseGPRForF16_F32 && - (ValVT == MVT::f16 || ValVT == MVT::bf16 || ValVT == MVT::f32)) { - LocVT = XLenVT; - LocInfo = CCValAssign::BCvt; - } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { - LocVT = MVT::i64; - LocInfo = CCValAssign::BCvt; - } - - // If this is a variadic argument, the RISC-V calling convention requires - // that it is assigned an 'even' or 'aligned' register if it has 8-byte - // alignment (RV32) or 16-byte alignment (RV64). An aligned register should - // be used regardless of whether the original argument was split during - // legalisation or not. The argument will not be passed by registers if the - // original type is larger than 2*XLEN, so the register alignment rule does - // not apply. - // TODO: To be compatible with GCC's behaviors, we don't align registers - // currently if we are using ILP32E calling convention. This behavior may be - // changed when RV32E/ILP32E is ratified. - unsigned TwoXLenInBytes = (2 * XLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && - DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && - ABI != RISCVABI::ABI_ILP32E) { - unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); - // Skip 'odd' register if necessary. - if (RegIdx != std::size(ArgGPRs) && RegIdx % 2 == 1) - State.AllocateReg(ArgGPRs); - } - - SmallVectorImpl &PendingLocs = State.getPendingLocs(); - SmallVectorImpl &PendingArgFlags = - State.getPendingArgFlags(); - - assert(PendingLocs.size() == PendingArgFlags.size() && - "PendingLocs and PendingArgFlags out of sync"); - - // Handle passing f64 on RV32D with a soft float ABI or when floating point - // registers are exhausted. - if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { - assert(PendingLocs.empty() && "Can't lower f64 if it is split"); - // Depending on available argument GPRS, f64 may be passed in a pair of - // GPRs, split between a GPR and the stack, or passed completely on the - // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these - // cases. 
- MCRegister Reg = State.AllocateReg(ArgGPRs); - if (!Reg) { - unsigned StackOffset = State.AllocateStack(8, Align(8)); - State.addLoc( - CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - return false; - } - LocVT = MVT::i32; - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - MCRegister HiReg = State.AllocateReg(ArgGPRs); - if (HiReg) { - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo)); - } else { - unsigned StackOffset = State.AllocateStack(4, Align(4)); - State.addLoc( - CCValAssign::getCustomMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - } - return false; - } - - // Fixed-length vectors are located in the corresponding scalable-vector - // container types. - if (ValVT.isFixedLengthVector()) - LocVT = TLI.getContainerForFixedLengthVector(LocVT); - - // Split arguments might be passed indirectly, so keep track of the pending - // values. Split vectors are passed via a mix of registers and indirectly, so - // treat them as we would any other argument. - if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - PendingLocs.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - PendingArgFlags.push_back(ArgFlags); - if (!ArgFlags.isSplitEnd()) { - return false; - } - } - - // If the split argument only had two elements, it should be passed directly - // in registers or on the stack. - if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() && - PendingLocs.size() <= 2) { - assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); - // Apply the normal calling convention rules to the first half of the - // split argument. - CCValAssign VA = PendingLocs[0]; - ISD::ArgFlagsTy AF = PendingArgFlags[0]; - PendingLocs.clear(); - PendingArgFlags.clear(); - return CC_RISCVAssign2XLen( - XLen, State, VA, AF, ValNo, ValVT, LocVT, ArgFlags, - ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E); - } - - // Allocate to a register if possible, or else a stack slot. - MCRegister Reg; - unsigned StoreSizeBytes = XLen / 8; - Align StackAlign = Align(XLen / 8); - - if ((ValVT == MVT::f16 || ValVT == MVT::bf16) && !UseGPRForF16_F32) - Reg = State.AllocateReg(ArgFPR16s); - else if (ValVT == MVT::f32 && !UseGPRForF16_F32) - Reg = State.AllocateReg(ArgFPR32s); - else if (ValVT == MVT::f64 && !UseGPRForF64) - Reg = State.AllocateReg(ArgFPR64s); - else if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) { - Reg = allocateRVVReg(ValVT, ValNo, State, TLI); - if (!Reg) { - // For return values, the vector must be passed fully via registers or - // via the stack. - // FIXME: The proposed vector ABI only mandates v8-v15 for return values, - // but we're using all of them. - if (IsRet) - return true; - // Try using a GPR to pass the address - if ((Reg = State.AllocateReg(ArgGPRs))) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - } else if (ValVT.isScalableVector()) { - LocVT = XLenVT; - LocInfo = CCValAssign::Indirect; - } else { - // Pass fixed-length vectors on the stack. - LocVT = ValVT; - StoreSizeBytes = ValVT.getStoreSize(); - // Align vectors to their element sizes, being careful for vXi1 - // vectors. - StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); - } - } - } else { - Reg = State.AllocateReg(ArgGPRs); - } - - unsigned StackOffset = - Reg ? 
0 : State.AllocateStack(StoreSizeBytes, StackAlign); - - // If we reach this point and PendingLocs is non-empty, we must be at the - // end of a split argument that must be passed indirectly. - if (!PendingLocs.empty()) { - assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); - assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); - - for (auto &It : PendingLocs) { - if (Reg) - It.convertToReg(Reg); - else - It.convertToMem(StackOffset); - State.addLoc(It); - } - PendingLocs.clear(); - PendingArgFlags.clear(); - return false; - } - - assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT || - (TLI.getSubtarget().hasVInstructions() && - (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) && - "Expected an XLenVT or vector types at this stage"); - - if (Reg) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - - // When a scalar floating-point value is passed on the stack, no - // bit-conversion is needed. - if (ValVT.isFloatingPoint() && LocInfo != CCValAssign::Indirect) { - assert(!ValVT.isVector()); - LocVT = ValVT; - LocInfo = CCValAssign::Full; - } - State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - return false; -} - void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -19631,189 +19145,6 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } -// FastCC has less than 1% performance improvement for some particular -// benchmark. But theoretically, it may have benefit for some cases. -bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, - unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI) { - if (LocVT == MVT::i32 || LocVT == MVT::i64) { - if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - const RISCVSubtarget &Subtarget = TLI.getSubtarget(); - - if (LocVT == MVT::f16 && Subtarget.hasStdExtZfhmin()) { - static const MCPhysReg FPR16List[] = { - RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H, - RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H, - RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H, - RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H}; - if (MCRegister Reg = State.AllocateReg(FPR16List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { - static const MCPhysReg FPR32List[] = { - RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, - RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, - RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, - RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; - if (MCRegister Reg = State.AllocateReg(FPR32List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { - static const MCPhysReg FPR64List[] = { - RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, - RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, - RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, 
RISCV::F5_D, RISCV::F6_D, - RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; - if (MCRegister Reg = State.AllocateReg(FPR64List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - // Check if there is an available GPR before hitting the stack. - if ((LocVT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) || - (LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || - (LocVT == MVT::f64 && Subtarget.is64Bit() && - Subtarget.hasStdExtZdinx())) { - if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - if (LocVT.getSizeInBits() != Subtarget.getXLen()) { - LocVT = Subtarget.getXLenVT(); - State.addLoc( - CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - LocVT = Subtarget.getXLenVT(); - LocInfo = CCValAssign::BCvt; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f16) { - unsigned Offset2 = State.AllocateStack(2, Align(2)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset2, LocVT, LocInfo)); - return false; - } - - if (LocVT == MVT::i32 || LocVT == MVT::f32) { - unsigned Offset4 = State.AllocateStack(4, Align(4)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); - return false; - } - - if (LocVT == MVT::i64 || LocVT == MVT::f64) { - unsigned Offset5 = State.AllocateStack(8, Align(8)); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); - return false; - } - - if (LocVT.isVector()) { - if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { - // Fixed-length vectors are located in the corresponding scalable-vector - // container types. - if (ValVT.isFixedLengthVector()) - LocVT = TLI.getContainerForFixedLengthVector(LocVT); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - } else { - // Try and pass the address via a "fast" GPR. - if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { - LocInfo = CCValAssign::Indirect; - LocVT = TLI.getSubtarget().getXLenVT(); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo)); - } else if (ValVT.isFixedLengthVector()) { - auto StackAlign = - MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne(); - unsigned StackOffset = - State.AllocateStack(ValVT.getStoreSize(), StackAlign); - State.addLoc( - CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); - } else { - // Can't pass scalable vectors on the stack. - return true; - } - } - - return false; - } - - return true; // CC didn't match. -} - -bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (ArgFlags.isNest()) { - report_fatal_error( - "Attribute 'nest' is not supported in GHC calling convention"); - } - - static const MCPhysReg GPRList[] = { - RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22, - RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27}; - - if (LocVT == MVT::i32 || LocVT == MVT::i64) { - // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim - // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 - if (MCRegister Reg = State.AllocateReg(GPRList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - const RISCVSubtarget &Subtarget = - State.getMachineFunction().getSubtarget(); - - if (LocVT == MVT::f32 && Subtarget.hasStdExtF()) { - // Pass in STG registers: F1, ..., F6 - // fs0 ... 
fs5 - static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F, - RISCV::F18_F, RISCV::F19_F, - RISCV::F20_F, RISCV::F21_F}; - if (MCRegister Reg = State.AllocateReg(FPR32List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if (LocVT == MVT::f64 && Subtarget.hasStdExtD()) { - // Pass in STG registers: D1, ..., D6 - // fs6 ... fs11 - static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D, - RISCV::F24_D, RISCV::F25_D, - RISCV::F26_D, RISCV::F27_D}; - if (MCRegister Reg = State.AllocateReg(FPR64List)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - if ((LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || - (LocVT == MVT::f64 && Subtarget.hasStdExtZdinx() && - Subtarget.is64Bit())) { - if (MCRegister Reg = State.AllocateReg(GPRList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return false; - } - } - - report_fatal_error("No registers left in GHC calling convention"); - return true; -} - // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -19864,11 +19195,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (CallConv == CallingConv::GHC) - CCInfo.AnalyzeFormalArguments(Ins, RISCV::CC_RISCV_GHC); + CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_GHC); else analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false, - CallConv == CallingConv::Fast ? RISCV::CC_RISCV_FastCC - : RISCV::CC_RISCV); + CallConv == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV); for (unsigned i = 0, e = ArgLocs.size(), InsIdx = 0; i != e; ++i, ++InsIdx) { CCValAssign &VA = ArgLocs[i]; @@ -20072,11 +19403,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallConv == CallingConv::GHC) { if (Subtarget.hasStdExtE()) report_fatal_error("GHC calling convention is not supported on RVE!"); - ArgCCInfo.AnalyzeCallOperands(Outs, RISCV::CC_RISCV_GHC); + ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_GHC); } else analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI, - CallConv == CallingConv::Fast ? RISCV::CC_RISCV_FastCC - : RISCV::CC_RISCV); + CallConv == CallingConv::Fast ? CC_RISCV_FastCC + : CC_RISCV); // Check if it's really possible to do a tail call. if (IsTailCall) @@ -20319,7 +19650,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); - analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, RISCV::CC_RISCV); + analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_RISCV); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { @@ -20358,9 +19689,9 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); - if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr, *this)) + if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, + ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr, + *this)) return false; } return true; @@ -20383,7 +19714,7 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, *DAG.getContext()); analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true, - nullptr, RISCV::CC_RISCV); + nullptr, CC_RISCV); if (CallConv == CallingConv::GHC && !RVLocs.empty()) report_fatal_error("GHC functions return void only"); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3beee4686956e..220219f98d3b2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H #include "RISCV.h" +#include "RISCVCallingConv.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -896,16 +897,6 @@ class RISCVTargetLowering : public TargetLowering { MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override; - /// RISCVCCAssignFn - This target-specific function extends the default - /// CCValAssign with additional information used to lower RISC-V calling - /// conventions. - typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI, - unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy, - const RISCVTargetLowering &TLI); - private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -1048,26 +1039,6 @@ class RISCVTargetLowering : public TargetLowering { SDValue End, SDValue Flags, SDLoc DL) const; }; -namespace RISCV { - -bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); - -bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, - MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, - bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI); - -bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State); - -ArrayRef getArgGPRs(const RISCVABI::ABI ABI); - -} // end namespace RISCV - namespace RISCVVIntrinsicsTable { struct RISCVVIntrinsicInfo { From dcfa147c9d9569ea44cb0f0b6981f69a62b87f71 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 6 Sep 2024 05:29:55 +0000 Subject: [PATCH 330/425] [gn build] Port 093b8bfe6b64 --- llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index c08c7c9828af7..6316e18b2dd97 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ 
b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -112,6 +112,7 @@ static_library("LLVMRISCVCodeGen") { "GISel/RISCVPreLegalizerCombiner.cpp", "GISel/RISCVRegisterBankInfo.cpp", "RISCVAsmPrinter.cpp", + "RISCVCallingConv.cpp", "RISCVCodeGenPrepare.cpp", "RISCVDeadRegisterDefinitions.cpp", "RISCVExpandAtomicPseudoInsts.cpp", From 24267a7e14b35f41ab55e15ba12bb80c82881941 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 5 Sep 2024 22:57:27 -0700 Subject: [PATCH 331/425] AMDGPU: Add f64 to f32 support for llvm.fptrunc.round (#107481) --- llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 12 ++++-- .../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 28 ++++++------- .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 39 +++++++++++++++++++ 4 files changed, 68 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 69e1b9a38324f..c0154645b391d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -230,11 +230,17 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI< let Uses = [MODE, EXEC] in { def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VGPR_32:$src0, i32imm:$round)>; + +def FPTRUNC_ROUND_F32_F64_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VReg_64:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; +def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F32_F64_PSEUDO $src0, (as_hw_round_mode $round))>; + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. 
let Defs = [SCC], isConvergent = 1 in { diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index a590c6560942c..6bcf9757d2945 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -165,7 +165,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII) { unsigned Opcode = MI.getOpcode(); if (TII->usesFPDPRounding(MI) || - Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO) { + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO || + Opcode == AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO) { switch (Opcode) { case AMDGPU::V_INTERP_P1LL_F16: case AMDGPU::V_INTERP_P1LV_F16: @@ -189,8 +190,13 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, B.addImm(0); // omod } else MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); - return Status(FP_ROUND_MODE_DP(3), - FP_ROUND_MODE_DP(Mode)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO: { + unsigned Mode = MI.getOperand(2).getImm(); + MI.removeOperand(2); + MI.setDesc(TII->get(AMDGPU::V_CVT_F32_F64_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); } default: return DefaultStatus; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll index 291fe00a6177b..21fe1ce4dc1d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll @@ -3,15 +3,15 @@ ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F64-FAIL %s -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-FAIL %s - ; TODO: check for GISEL when bfloat is supported. 
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f32-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F32-FAIL %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/bf16-f64-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=BF16-F64-FAIL %s -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F32-TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f16-f32-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F16-F32-TONEARESTAWAY-FAIL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-TONEARESTAWAY-FAIL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -filetype=null %t/f32-f64-tonearestaway-err.ll 2>&1 | FileCheck --ignore-case --check-prefix=F32-F64-TONEARESTAWAY-FAIL %s ;--- f16-f64-err.ll define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %out) { @@ -21,14 +21,6 @@ define amdgpu_gs void @test_fptrunc_round_f16_f64(double %a, ptr addrspace(1) %o ret void } -;--- f32-f64-err.ll -define amdgpu_gs void @test_fptrunc_round_f32_f64(double %a, ptr addrspace(1) %out) { -; F32-F64-FAIL: LLVM ERROR: Cannot select - %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") - store float %res, ptr addrspace(1) %out, align 4 - ret void -} - ;--- bf16-f32-err.ll define amdgpu_gs void @test_fptrunc_round_bf16_f32(float %a, ptr addrspace(1) %out) { ; BF16-F32-FAIL: LLVM ERROR: Cannot select @@ -47,8 +39,16 @@ define amdgpu_gs void @test_fptrunc_round_bf16_f64(double %a, ptr addrspace(1) % ;--- f16-f32-tonearestaway-err.ll define amdgpu_gs void @test_fptrunc_round_f16_f32_tonearestaway(float %a, ptr addrspace(1) %out) { -; TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select +; F16-F32-TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearestaway") store half %res, ptr addrspace(1) %out, align 2 ret void } + +;--- f32-f64-tonearestaway-err.ll +define amdgpu_gs void @test_fptrunc_round_f32_f64_tonearestaway(double %a, ptr addrspace(1) %out) { +; F32-F64-TONEARESTAWAY-FAIL: LLVM ERROR: Cannot select + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearestaway") + store float %res, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll index 54ed6f1eb4282..3d9ce6e79d9d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll @@ -516,3 +516,42 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward") ret <8 x half> %res } + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_tonearest(double %a) { +; CHECK-LABEL: 
v_fptrunc_round_f64_to_f32_tonearest: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.tonearest") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_upward(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_upward: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.upward") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_downward(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_downward: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.downward") + ret float %res +} + +define amdgpu_gs float @v_fptrunc_round_f64_to_f32_towardzero(double %a) { +; CHECK-LABEL: v_fptrunc_round_f64_to_f32_towardzero: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 +; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; CHECK-NEXT: ; return to shader part epilog + %res = call float @llvm.fptrunc.round.f32.f64(double %a, metadata !"round.towardzero") + ret float %res +} From ddf40e0132cdfb9443e8dce9ca18d4f5595fb73c Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 6 Sep 2024 09:00:00 +0200 Subject: [PATCH 332/425] [lldb] Correctly reconstruct simplified names for type units (#107240) We need to resolve the type signature to get a hold of the template argument dies. The associated test case passes even without this patch, but it only does it by accident (because the subsequent code considers the types to be in an anonymous namespace and this not subject to uniqueing). This will change once my other patch starts resolving names correctly. 
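For illustration only (not part of this patch; the type and variable names are
taken from the test below, the member is a simplified stand-in): with
-gsimple-template-names the DW_AT_name of an instantiation is just the bare
template name, and with -fdebug-types-section the definition DIE that carries
the template parameter children lives in a type unit which the skeleton DIE
references through DW_AT_signature, so rebuilding the "<...>" suffix first
requires resolving that signature.

    // Compiled with: clang++ -g -gsimple-template-names -fdebug-types-section
    // DW_AT_name of the instantiation is plain "ReferencesBoth"; the "<'A'>"
    // part has to be reconstructed from DW_TAG_template_value_parameter DIEs,
    // which sit under the definition in the referenced type unit.
    template <char C> struct ReferencesBoth { char c = C; }; // simplified stand-in
    ReferencesBoth<'A'> both_a;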
--- .../Plugins/SymbolFile/DWARF/DWARFASTParser.h | 2 +- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 6 +++-- .../SymbolFile/DWARF/DWARFASTParserClang.h | 4 ++-- .../DWARF/x86/type-definition-search.cpp | 23 +++++++++++-------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h index 636ff4b5cf11d..971cbe47fb702 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h @@ -58,7 +58,7 @@ class DWARFASTParser { virtual void EnsureAllDIEsInDeclContextHaveBeenParsed( CompilerDeclContext decl_context) = 0; - virtual std::string GetDIEClassTemplateParams(const DWARFDIE &die) = 0; + virtual std::string GetDIEClassTemplateParams(DWARFDIE die) = 0; static std::optional ParseChildArrayInfo(const DWARFDIE &parent_die, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 3c58be26c266b..4d688f9a1358b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -818,8 +818,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, &attrs.decl, clang_type, resolve_state, payload); } -std::string -DWARFASTParserClang::GetDIEClassTemplateParams(const DWARFDIE &die) { +std::string DWARFASTParserClang::GetDIEClassTemplateParams(DWARFDIE die) { + if (DWARFDIE signature_die = die.GetReferencedDIE(DW_AT_signature)) + die = signature_die; + if (llvm::StringRef(die.GetName()).contains("<")) return {}; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index ae6720608121f..2806fbbeb99d2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -106,8 +106,8 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { /// \return A string, including surrounding '<>', of the template parameters. /// If the DIE's name already has '<>', returns an empty string because /// it's assumed that the caller is using the DIE name anyway. 
- std::string GetDIEClassTemplateParams( - const lldb_private::plugin::dwarf::DWARFDIE &die) override; + std::string + GetDIEClassTemplateParams(lldb_private::plugin::dwarf::DWARFDIE die) override; void MapDeclDIEToDefDIE(const lldb_private::plugin::dwarf::DWARFDIE &decl_die, const lldb_private::plugin::dwarf::DWARFDIE &def_die); diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index bce6ed36b0968..5a40a6e0fbc27 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -4,15 +4,20 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -gsimple-template-names -DFILE_B -// RUN: ld.lld %t-a.o %t-b.o -o %t -// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s - -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -DFILE_B -// RUN: ld.lld %t-a.o %t-b.o -o %t -// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B +// RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n +// RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B +// RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t +// RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B +// RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn +// RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s // CHECK: (lldb) target variable // CHECK-NEXT: (ReferencesBoth<'A'>) both_a = { From 9c72a308d839a27ffcbb0c67104baceb1871c50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 6 Sep 2024 09:08:26 +0200 Subject: [PATCH 333/425] [clang][ASTImporter] New fix for default template parameter values. (#101836) Commit e4440b8 added a change that introduced new crash in an incorrectly handled case. This is fixed here. Default argument definition or inheritance is preserved in the "To" AST compared to the "From". If the default argument is defined already in the "To" AST it can be duplicated at the import. 
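For reference, a minimal sketch of what an "inherited" default template
argument means, in plain C++ and independent of the importer (parameter and
type names here are made up):

    template <typename T = int> struct S;  // this redeclaration defines the default argument
    template <typename T> struct S {};     // this one only inherits it from the previous declaration
    S<> s;                                  // OK, T is int

In the AST the second declaration's parameter does not own a copy of the
default: its DefaultArgStorage points back at the parameter of the declaration
that spelled it out. That is the state the importer has to reproduce on the
"To" side without duplicating a default that already exists there.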
--- clang/lib/AST/ASTImporter.cpp | 118 +++------ clang/unittests/AST/ASTImporterTest.cpp | 334 +++++++++++++++++++----- 2 files changed, 309 insertions(+), 143 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index fa850409ba121..d335e34907b59 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -360,51 +360,42 @@ namespace clang { } template - void tryUpdateTemplateParmDeclInheritedFrom(NamedDecl *RecentParm, - NamedDecl *NewParm) { - if (auto *ParmT = dyn_cast(RecentParm)) { - if (ParmT->hasDefaultArgument()) { - auto *P = cast(NewParm); - P->removeDefaultArgument(); - P->setInheritedDefaultArgument(Importer.ToContext, ParmT); + Error importTemplateParameterDefaultArgument(const TemplateParmDeclT *D, + TemplateParmDeclT *ToD) { + Error Err = Error::success(); + if (D->hasDefaultArgument()) { + if (D->defaultArgumentWasInherited()) { + auto *ToInheritedFrom = const_cast( + importChecked(Err, D->getDefaultArgStorage().getInheritedFrom())); + if (Err) + return std::move(Err); + if (!ToInheritedFrom->hasDefaultArgument()) { + // Resolve possible circular dependency between default value of the + // template argument and the template declaration. + const auto ToInheritedDefaultArg = + importChecked(Err, D->getDefaultArgStorage() + .getInheritedFrom() + ->getDefaultArgument()); + if (Err) + return std::move(Err); + ToInheritedFrom->setDefaultArgument(Importer.getToContext(), + ToInheritedDefaultArg); + } + ToD->setInheritedDefaultArgument(ToD->getASTContext(), + ToInheritedFrom); + } else { + Expected ToDefaultArgOrErr = + import(D->getDefaultArgument()); + if (!ToDefaultArgOrErr) + return ToDefaultArgOrErr.takeError(); + // Default argument could have been set in the + // '!ToInheritedFrom->hasDefaultArgument()' branch above. + if (!ToD->hasDefaultArgument()) + ToD->setDefaultArgument(Importer.getToContext(), + *ToDefaultArgOrErr); } } - } - - // Update the parameter list `NewParams` of a template declaration - // by "inheriting" default argument values from `RecentParams`, - // which is the parameter list of an earlier declaration of the - // same template. (Note that "inheriting" default argument values - // is not related to object-oriented inheritance.) - // - // In the clang AST template parameters (NonTypeTemplateParmDec, - // TemplateTypeParmDecl, TemplateTemplateParmDecl) have a reference to the - // default value, if one is specified at the first declaration. The default - // value can be specified only once. The template parameters of the - // following declarations have a reference to the original default value - // through the "inherited" value. This value should be set for all imported - // template parameters that have a previous declaration (also a previous - // template declaration). - // - // In the `Visit*ParmDecl` functions the default value of these template - // arguments is always imported. At that location the previous declaration - // is not easily accessible, it is not possible to call - // `setInheritedDefaultArgument` at that place. - // `updateTemplateParametersInheritedFrom` is called later when the already - // imported default value is erased and changed to "inherited". - // It is important to change the mode to "inherited" otherwise false - // structural in-equivalences could be detected. 
- void updateTemplateParametersInheritedFrom( - const TemplateParameterList &RecentParams, - TemplateParameterList &NewParams) { - for (auto [Idx, Param] : enumerate(RecentParams)) { - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - tryUpdateTemplateParmDeclInheritedFrom( - Param, NewParams.getParam(Idx)); - } + return Err; } public: @@ -5955,8 +5946,8 @@ ASTNodeImporter::VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D) { ExpectedDecl ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { // For template arguments, we adopt the translation unit as our declaration - // context. This context will be fixed when the actual template declaration - // is created. + // context. This context will be fixed when (during) the actual template + // declaration is created. ExpectedSLoc BeginLocOrErr = import(D->getBeginLoc()); if (!BeginLocOrErr) @@ -5988,13 +5979,8 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { ToD->setTypeConstraint(ToConceptRef, ToIDC); } - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(ToD->getASTContext(), *ToDefaultArgOrErr); - } + if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) + return Err; return ToD; } @@ -6020,13 +6006,9 @@ ASTNodeImporter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { D->isParameterPack(), ToTypeSourceInfo)) return ToD; - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr); - } + Err = importTemplateParameterDefaultArgument(D, ToD); + if (Err) + return Err; return ToD; } @@ -6057,13 +6039,8 @@ ASTNodeImporter::VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *D) { *TemplateParamsOrErr)) return ToD; - if (D->hasDefaultArgument()) { - Expected ToDefaultArgOrErr = - import(D->getDefaultArgument()); - if (!ToDefaultArgOrErr) - return ToDefaultArgOrErr.takeError(); - ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr); - } + if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) + return Err; return ToD; } @@ -6201,9 +6178,6 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) { } D2->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - **TemplateParamsOrErr); } return D2; @@ -6518,9 +6492,6 @@ ExpectedDecl ASTNodeImporter::VisitVarTemplateDecl(VarTemplateDecl *D) { ToTemplated->setPreviousDecl(PrevTemplated); } ToVarTD->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - **TemplateParamsOrErr); } return ToVarTD; @@ -6793,9 +6764,6 @@ ASTNodeImporter::VisitFunctionTemplateDecl(FunctionTemplateDecl *D) { TemplatedFD->setPreviousDecl(PrevTemplated); } ToFunc->setPreviousDecl(Recent); - - updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()), - *Params); } return ToFunc; diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index cc87e83e86055..aacecd3fbcd90 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -9836,41 +9836,75 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) { 
struct ImportTemplateParmDeclDefaultValue : public ASTImporterOptionSpecificTestBase { protected: - void checkTemplateParams(RedeclarableTemplateDecl *D) { - auto *CanD = cast(D->getCanonicalDecl()); - auto *CanNonTypeP = cast( - CanD->getTemplateParameters()->getParam(0)); - auto *CanTypeP = - cast(CanD->getTemplateParameters()->getParam(1)); - auto *CanTemplateP = cast( - CanD->getTemplateParameters()->getParam(2)); - EXPECT_FALSE(CanNonTypeP->getDefaultArgStorage().isInherited()); - EXPECT_FALSE(CanTypeP->getDefaultArgStorage().isInherited()); - EXPECT_FALSE(CanTemplateP->getDefaultArgStorage().isInherited()); - for (Decl *Redecl : D->redecls()) { - auto *ReD = cast(Redecl); - if (ReD != CanD) { - auto *NonTypeP = cast( - ReD->getTemplateParameters()->getParam(0)); - auto *TypeP = cast( - ReD->getTemplateParameters()->getParam(1)); - auto *TemplateP = cast( - ReD->getTemplateParameters()->getParam(2)); - EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited()); - EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited()); - EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited()); - EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(), - CanNonTypeP); - EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), CanTypeP); - EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(), - CanTemplateP); - } + void checkTemplateParams(RedeclarableTemplateDecl *D, + RedeclarableTemplateDecl *InheritedFromD) { + auto *NonTypeP = + cast(D->getTemplateParameters()->getParam(0)); + auto *TypeP = + cast(D->getTemplateParameters()->getParam(1)); + auto *TemplateP = + cast(D->getTemplateParameters()->getParam(2)); + if (InheritedFromD) { + EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited()); + EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited()); + EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited()); + EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(0)); + EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(1)); + EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(), + InheritedFromD->getTemplateParameters()->getParam(2)); + } else { + EXPECT_FALSE(NonTypeP->getDefaultArgStorage().isInherited()); + EXPECT_FALSE(TypeP->getDefaultArgStorage().isInherited()); + EXPECT_FALSE(TemplateP->getDefaultArgStorage().isInherited()); } } - void testImport(RedeclarableTemplateDecl *FromD) { - RedeclarableTemplateDecl *ToD = Import(FromD, Lang_CXX14); - checkTemplateParams(ToD); + void testImport(RedeclarableTemplateDecl *FromD1, + RedeclarableTemplateDecl *FromD2, + RedeclarableTemplateDecl *FromD3, + RedeclarableTemplateDecl *ToExistingD1) { + auto *ToD1 = Import(FromD1, Lang_CXX14); + auto *ToD2 = Import(FromD2, Lang_CXX14); + auto *ToD3 = Import(FromD3, Lang_CXX14); + checkTemplateParams(ToD1, nullptr); + checkTemplateParams(ToD2, ToD1); + checkTemplateParams(ToD3, ToExistingD1 ? ToExistingD1 : ToD1); + } + + // In these tests a circular dependency is created between the template + // parameter default value and the template declaration (with the same + // template parameter). 
+ template + void + testTemplateParmDeclCircularDependency(ClassTemplateDecl *FromD, + ClassTemplateDecl *FromDInherited) { + auto GetTemplateParm = + [](ClassTemplateDecl *D) -> const TemplateParmDeclT * { + return dyn_cast( + D->getTemplateParameters()->getParam(0)); + }; + + ASSERT_FALSE(GetTemplateParm(FromD)->getDefaultArgStorage().isInherited()); + ASSERT_TRUE( + GetTemplateParm(FromDInherited)->getDefaultArgStorage().isInherited()); + + auto *ToD = Import(FromD, Lang_CXX14); + EXPECT_TRUE(ToD); + + auto *ToDInherited = Import(FromDInherited, Lang_CXX14); + EXPECT_TRUE(ToDInherited); + + EXPECT_FALSE(GetTemplateParm(ToD)->getDefaultArgStorage().isInherited()); + EXPECT_TRUE( + GetTemplateParm(ToDInherited)->getDefaultArgStorage().isInherited()); + EXPECT_EQ(GetTemplateParm(ToDInherited) + ->getDefaultArgStorage() + .getInheritedFrom(), + GetTemplateParm(ToD)); + + EXPECT_EQ(ToD->getPreviousDecl(), ToDInherited); } const char *CodeFunction = @@ -9878,81 +9912,245 @@ struct ImportTemplateParmDeclDefaultValue template struct X; template class C = X> - void f(); + void test(); template class C> - void f(); + void test(); template class C> - void f() {} + void test() {} )"; const char *CodeClass = R"( + namespace N { template struct X; template class C = X> - struct S; + struct test; template class C> - struct S; + struct test; template class C> - struct S {}; + struct test {}; + } )"; const char *CodeVar = R"( + namespace N { template struct X; template class C = X> - extern int V; + extern int test; template class C> - extern int V; + extern int test; template class C> - int V = A; + int test = A; + } )"; }; -TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) { - Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( +TEST_P(ImportTemplateParmDeclDefaultValue, InvisibleInheritedFrom) { + const char *ToCode = + R"( + template + void f() {} + )"; + TranslationUnitDecl *ToTU = getToTuDecl(ToCode, Lang_CXX14); + auto *ToFDef = FirstDeclMatcher().match( + ToTU, functionTemplateDecl(hasName("f"))); + + const char *FromCode = + R"( + template + void f() {} + template + void f(); + )"; + TranslationUnitDecl *FromTU = getTuDecl(FromCode, Lang_CXX14); + auto *FromFDef = FirstDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("f"))); + auto *FromF = LastDeclMatcher().match( FromTU, functionTemplateDecl(hasName("f"))); - testImport(FromLastD); + + auto *ToFDefImported = Import(FromFDef, Lang_CXX14); + EXPECT_EQ(ToFDefImported, ToFDef); + auto *ToF = Import(FromF, Lang_CXX14); + EXPECT_NE(ToF, ToFDef); + const auto *Parm = dyn_cast( + ToF->getTemplateParameters()->getParam(0)); + EXPECT_TRUE(Parm->defaultArgumentWasInherited()); + // FIXME: This behavior may be confusing: + // Default value is not inherited from the existing declaration, instead a new + // is created at import that is similar to the existing but not reachable from + // the AST. 
+ EXPECT_NE(Parm->getDefaultArgStorage().getInheritedFrom(), + ToFDef->getTemplateParameters()->getParam(0)); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) { + TranslationUnitDecl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("test") /*, hasBody(stmt())*/)); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingFunctionTemplate) { - getToTuDecl(CodeFunction, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, functionTemplateDecl(hasName("f"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeFunction, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, functionTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeFunction, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, functionTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportClassTemplate) { - Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, classTemplateDecl(hasName("S"))); - testImport(FromLastD); + TranslationUnitDecl *FromTU = getTuDecl(CodeClass, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingClassTemplate) { - getToTuDecl(CodeClass, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, classTemplateDecl(hasName("S"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeClass, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, classTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeClass, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportVarTemplate) { - Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, varTemplateDecl(hasName("V"))); - testImport(FromLastD); + TranslationUnitDecl *FromTU = getTuDecl(CodeVar, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, varTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, nullptr); } TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingVarTemplate) { - getToTuDecl(CodeVar, Lang_CXX14); - Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14); - auto *FromLastD = LastDeclMatcher().match( - FromTU, varTemplateDecl(hasName("V"))); - testImport(FromLastD); + TranslationUnitDecl *ToTU = getToTuDecl(CodeVar, Lang_CXX14); + auto *ToD1 = FirstDeclMatcher().match( + ToTU, varTemplateDecl(hasName("test"))); + TranslationUnitDecl *FromTU = getTuDecl(CodeVar, Lang_CXX14); + auto *D3 = LastDeclMatcher().match( + FromTU, varTemplateDecl(hasName("test"))); + auto *D2 = dyn_cast(D3->getPreviousDecl()); + 
auto *D1 = dyn_cast(D2->getPreviousDecl()); + testImport(D1, D2, D3, ToD1); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + NonTypeTemplateParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + struct Y { + Z *z; + static const int x = 1; + }; + + template + struct X; + + template + struct X { + static const int A = 1; + }; + + struct Z { + template + void f(int A = X

::A); + }; + )"; + + Decl *FromTU = getTuDecl(Code, Lang_CXX14); + auto *FromD = FirstDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + auto *FromDInherited = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + testTemplateParmDeclCircularDependency( + FromD, FromDInherited); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + TemplateTypeParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + struct Y { + Z *z; + }; + + template + struct X; + + template + struct X { + static const int A = 1; + }; + + struct Z { + template + void f(int A = X::A); + }; + )"; + + Decl *FromTU = getTuDecl(Code, Lang_CXX14); + auto *FromD = FirstDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + auto *FromDInherited = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + testTemplateParmDeclCircularDependency(FromD, + FromDInherited); +} + +TEST_P(ImportTemplateParmDeclDefaultValue, + TemplateTemplateParmDeclCircularDependency) { + const char *Code = + R"( + struct Z; + + template + struct Y { + Z *z; + }; + + template